## Chapter 4. Text versus Bytes

In [55]:
print('\x41')  # utf-8
print('\x41\x00')  # utf-16LE

A
A 
¬
¬ 


In [56]:
# Example 4-1. Encoding and decoding

s = 'café'
print(len(s))

b = s.encode('utf8') 
print(b)
print(len(b))

print(b.decode('utf8'))

4
b'caf\xc3\xa9'
5
café


In [57]:
# Example 4-2. A five-byte sequence as bytes and as bytearray

cafe = bytes('café', encoding='utf_8')
print(cafe)
print(cafe[0])  # each item is an integer in range(256)
print(cafe[:1])  # slices of bytes are also bytes
print(cafe[0] == cafe[:1])  # False

cafe_arr = bytearray(cafe)
print(cafe_arr)
print(cafe_arr[-1:])  # slices of bytearray are also bytearray

b'caf\xc3\xa9'
99
b'c'
False
bytearray(b'caf\xc3\xa9')
bytearray(b'\xa9')


In [58]:
bytes.fromhex('31 4B CE A9')

b'1K\xce\xa9'

In [59]:
# Example 4-3. Initializing bytes from the raw data of an array
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])  # 'h' : short integers (16 bits)
octets = bytes(numbers)
octets

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

### Basic Encoders/Decoders

In [60]:
# Example 4-4. The string “El Niño” encoded with three codecs producing very different byte sequences

for codec in ['latin_1', 'utf_8', 'utf_16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [62]:
# Example 4-5. Encoding to bytes: success and error handling

city = 'São Paulo'
print(city.encode('utf_8'))
print(city.encode('utf_16'))
print(city.encode('iso8859_1'))
print(city.encode('cp437'))

b'S\xc3\xa3o Paulo'
b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'
b'S\xe3o Paulo'


UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [64]:
print(city.encode('cp437', errors='ignore'))
print(city.encode('cp437', errors='replace'))
print(city.encode('cp437', errors='xmlcharrefreplace'))

b'So Paulo'
b'S?o Paulo'
b'S&#227;o Paulo'


In [65]:
# Example 4-6. Decoding from str to bytes: success and error handling
octets = b'Montr\xe9al'
print(octets.decode('cp1252'))
print(octets.decode('iso8859_7'))
print(octets.decode('koi8_r'))
print(octets.decode('utf_8'))

Montréal
Montrιal
MontrИal


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [66]:
print(octets.decode('utf_8', errors='replace'))

Montr�al


In [69]:
# Example 4-7. ola.py: “Hello, World!” in Portuguese
# coding: cp1252
print('Olá, Mundo!')

Olá, Mundo!


In [70]:
# BOM: Byte Order Mark
u16 = 'El Niño'.encode('utf_16')
print(u16)
print(list(u16))

u16le = 'El Niño'.encode('utf_16le')
print(list(u16le))

u16be = 'El Niño'.encode('utf_16be')
print(list(u16be))

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'
[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]
[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]
[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]


### Handling Text Files

In [50]:
# Example 4-8. A platform encoding issue (if you try this on your machine, you may or may not see the problem)

print(open('cafe.txt', 'w', encoding='utf_8').write('café'))
print(open('cafe.txt').read())

4
caf챕


In [None]:
# Example 4-9. Closer inspection of Example 4-8 running on Windows reveals the bug and how to fix it\

fp = open('cafe.txt', 'w', encoding='utf_8')
print(fp)
print(fp.write('café'))  # 4
fp.close()
print()

import os

print(os.stat('cafe.txt').st_size)  # 5

fp2 = open('cafe.txt')
print(fp2)
print(fp2.encoding)
print(fp2.read())

fp3 = open('cafe.txt', encoding='utf_8')
print(fp3)
print(fp3.read())

fp4 = open('cafe.txt', 'rb')
print(fp4)
print(fp4.read())

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>
4

5
<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp949'>
cp949
caf챕
<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>
café
<_io.BufferedReader name='cafe.txt'>
b'caf\xc3\xa9'


#### Beware of Encoding Defaults

In [2]:
# Example 4-10. Exploring encoding defaults

import locale
import sys

expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""
my_file = open('dummy', 'w')
for expression in expressions.split():
    value = eval(expression)
    print(f'{expression:>30} -> {value!r}')


 locale.getpreferredencoding() -> 'cp949'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp949'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'utf-8'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'utf-8'


In [3]:
# Example 4-12. stdout_check.py

import sys
from unicodedata import name

print(sys.version)
print()
print('sys.stdout.isatty():', sys.stdout.isatty())
print('sys.stdout.encoding:', sys.stdout.encoding)
print()

test_chars = [
'\N{HORIZONTAL ELLIPSIS}', # exists in cp1252, not in cp437
'\N{INFINITY}', # exists in cp437, not in cp1252
'\N{CIRCLED NUMBER FORTY TWO}', # not in cp437 or in cp1252
]

for char in test_chars:
    print(f'Trying to output {name(char)}:')
    print(char)

3.10.7 (tags/v3.10.7:6cc6b13, Sep  5 2022, 14:08:36) [MSC v.1933 64 bit (AMD64)]

sys.stdout.isatty(): False
sys.stdout.encoding: UTF-8

Trying to output HORIZONTAL ELLIPSIS:
…
Trying to output INFINITY:
∞
Trying to output CIRCLED NUMBER FORTY TWO:
㊷


### Normalizing Unicode for Reliable Comparisons

In [4]:
s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'

print(s1, s2)
print(len(s1), len(s2))
print(s1 == s2)  # differ!

café café
4 5
False


In [5]:
from unicodedata import normalize

s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'

print(len(s1), len(s2))
print(len(normalize('NFC', s1)), len(normalize('NFC', s2)))
print(len(normalize('NFD', s1)), len(normalize('NFD', s2)))
print(normalize('NFC', s1) == normalize('NFC', s2))
print(normalize('NFD', s1) == normalize('NFD', s2))

4 5
4 4
5 5
True
True


In [2]:
from unicodedata import normalize, name
ohm = '\u2126'
print(name(ohm))

ohm_c = normalize('NFC', ohm)
print(name(ohm_c))

print(ohm == ohm_c)
print(normalize('NFC', ohm) == normalize('NFC', ohm_c))

OHM SIGN
GREEK CAPITAL LETTER OMEGA
False
True


In [7]:
from unicodedata import normalize, name
half = '\N{VULGAR FRACTION ONE HALF}'

print(half)
print(normalize('NFKC', half))

for char in normalize('NFKC', half):
    print(char, name(char), sep='\t')

½
1⁄2
1	DIGIT ONE
⁄	FRACTION SLASH
2	DIGIT TWO


In [8]:
four_squared = '4²'
print(normalize('NFKC', four_squared))

micro = 'μ'
micro_kc = normalize('NFKC', micro)
print(micro, micro_kc)

print(ord(micro), ord(micro_kc))
print(name(micro), name(micro_kc))

42
μ μ
956 956
GREEK SMALL LETTER MU GREEK SMALL LETTER MU


#### Case Folding

In [3]:
# When str.casefold() and str.lower return different results

micro = 'μ'
print(name(micro))

micro_cf = micro.casefold()
print(name(micro_cf))
print(micro, micro_cf)

eszett = 'ß'
print(name(eszett))

eszett_cf = eszett.casefold()
print(eszett, eszett_cf)

GREEK SMALL LETTER MU
GREEK SMALL LETTER MU
μ μ
LATIN SMALL LETTER SHARP S
ß ss


#### Utility Functions for Normalized Text Matching

In [4]:
from unicodedata import normalize

def nfc_equal(str1, str2):
    return normalize('NFC', str1) == normalize('NFC', str2)

def fold_equal(str1, str2):
    return (normalize('NFC', str1).casefold() == normalize('NFC', str2).casefold())

In [6]:
# Example 4-13. normeq.py: normalized Unicode string comparison

# Utility functions for normalized Unicode string comparison.
# Using Normal Form C, case sensitive:
s1 = 'café'
s2 = 'cafe\u0301'
print(s1 == s2)
print(nfc_equal(s1, s2))


print(nfc_equal('A', 'a'))
print()

# Using Normal Form C with case folding:
s3 = 'Straße'
s4 = 'strasse'
print(s3 == s4)
print(nfc_equal(s3, s4))
print(fold_equal(s3, s4))
print(fold_equal(s1, s2))
print(fold_equal('A', 'a'))


False
True
False

False
False
True
True
True


#### Extreme "Normalization": Taking Out Diacritics

In [12]:
# Example 4-14. simplify.py: function to remove all combining marks

import unicodedata
import string

def shave_marks(txt):
    """Remove all diacritic marks"""

    norm_txt = unicodedata.normalize('NFD', txt)
    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)

In [8]:
# Example 4-15. Two examples using shave_marks from Example 4-14

order = '“Herr Voß: • ½ cup of OEtker™ caffè latte • bowl of açaí.”'
print(shave_marks(order))

Greek = 'Ζέφυρος, Zéfiro'
print(shave_marks(Greek))

“Herr Voß: • ½ cup of OEtker™ caffe latte • bowl of acai.”
Ζεφυρος, Zefiro


In [9]:
# Example 4-16. Function to remove combining marks from Latin characters (import
# statements are omitted as this is part of the simplify.py module from Example 4-14)

def shave_marks_latin(txt):
    """Remove all diacritic marks from Latin base characters"""

    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    preserve = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue # ignore diacritic on Latin base char
        preserve.append(c)
        # if it isn't a combining char, it's a new base char
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(preserve)
    return unicodedata.normalize('NFC', shaved)

In [1]:
# Example 4-17. Transform some Western typographical symbols into ASCII (this
# snippet is also part of simplify.py from Example 4-14)

single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""","""'f"^<''""---~>""")
multi_map = str.maketrans({
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'OE': 'OE',
    'oe': 'oe',
    '™': '(TM)',
    '‰': '<per mille>',
    '†': '**',
    '‡': '***',
})

multi_map.update(single_map)

def dewinize(txt):
    """Replace Win1252 symbols with ASCII chars or sequences"""

    return txt.translate(multi_map)


def asciize(txt):
    no_marks = shave_marks_latin(dewinize(txt))
    no_marks = no_marks.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', no_marks)

ValueError: string keys in translate table must be of length 1

In [14]:
# Example 4-18. Two examples using asciize from Example 4-17

order = '“Herr Voß: • ½ cup of OEtker™ caffè latte • bowl of açaí.”'
print(dewinize(order))
print(asciize(order))

NameError: name 'dewinize' is not defined

### Sorting Unicode Text

In [2]:
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted(fruits)  # unacceptable results with non-ASCII characters

['acerola', 'atemoia', 'açaí', 'caju', 'cajá']

In [3]:
# Example 4-19. locale_sort.py: using the locale.strxfrm function as the sort key

import locale
my_locale = locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
print(my_locale)
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=locale.strxfrm)
print(sorted_fruits)

pt_BR.UTF-8
['açaí', 'acerola', 'atemoia', 'cajá', 'caju']


In [5]:
# Example 4-20. Using the pyuca.Collator.sort_key method

import pyuca
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits

['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

### The Unicode Database

In [None]:
# Example 4-21. cf.py: the character finder utility
#!/usr/bin/env python3

import sys
import unicodedata

START, END = ord(' '), sys.maxunicode + 1

def find(*query_words, start=START, end=END):
    query = {w.upper() for w in query_words}
    for code in range(start, end):
        char = chr(code)
        name = unicodedata.name(char, None)
        if name and query.issubset(name.split()):
            print(f'U+{code:04X}\t{char}\t{name}')

def main(words):
    if words:
        find(*words)
    else:
        print('Please provide words to find.')

if __name__ == '__main__':
    main(sys.argv[1:])

In [6]:
# Example 4-22. Demo of Unicode database numerical character metadata (callouts describe each column in the output)
import unicodedata
import re

re_digit = re.compile(r'\d')

sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

for char in sample:
    print(f'U+{ord(char):04x}',
        char.center(6),
        're_dig' if re_digit.match(char) else '-',
        'isdig' if char.isdigit() else '-',
        'isnum' if char.isnumeric() else '-',
        f'{unicodedata.numeric(char):5.2f}',
        unicodedata.name(char),
        sep='\t')

U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX


In [7]:
# Example 4-23. ramanujan.py: compare behavior of simple str and bytes regular expressions
import re

re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
" as 1729 = 1³ + 12³ = 9³ + 10³.")

text_bytes = text_str.encode('utf_8')
print(f'Text\n {text_str!r}')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))
print(' bytes:', re_numbers_bytes.findall(text_bytes))
print('Words')
print(' str :', re_words_str.findall(text_str))
print(' bytes:', re_words_bytes.findall(text_bytes))

Text
 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
 str : ['௧௭௨௯', '1729', '1', '12', '9', '10']
 bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
 str : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
 bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']
