In [2]:
s = 'café'
len(s)

4

In [3]:
b = s.encode('utf8')
b

b'caf\xc3\xa9'

In [4]:
len(b)

5

In [5]:
b.decode('utf8')

'café'

In [6]:
cafe = bytes('café', encoding='utf_8')
cafe

b'caf\xc3\xa9'

In [15]:
cafe[0]

99

In [8]:
cafe[:1]

b'c'

In [9]:
cafe_arr = bytearray(cafe)
cafe_arr

bytearray(b'caf\xc3\xa9')

In [10]:
cafe_arr[-1:]

bytearray(b'\xa9')

In [17]:
# https://docs.python.org/3/library/array.html
import array
# Typecode 'h' is a signed short; bytes() exposes the raw machine
# representation of the five items.
numbers = array.array('h', range(-2, 3))
octects = bytes(numbers)
octects

b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

In [18]:
for encoding in ['latin_1', 'utf_8', 'utf_16']:
    print(encoding, 'El Niño'.encode(encoding), sep='\t')

latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [19]:
city = 'São Paulo'
city.encode('utf_8')

b'S\xc3\xa3o Paulo'

In [20]:
city.encode('utf_16')

b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [21]:
city.encode('iso8859_1')

b'S\xe3o Paulo'

In [22]:
city.encode('cp437') # UnicodeEncodeError can't encode the ã

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [23]:
city.encode('cp437', errors='ignore') # bad idea since it leads to silent data loss

b'So Paulo'

In [24]:
city.encode('cp437', errors='replace')

b'S?o Paulo'

In [25]:
city.encode('cp437', errors='xmlcharrefreplace')

b'S&#227;o Paulo'

In [26]:
octets = b'Montr\xe9al'
octets.decode('cp1252')

'Montréal'

In [27]:
octets.decode('iso8859_7')

'Montrιal'

In [28]:
# In KOI8-R (Russian), byte \xe9 decodes to the Cyrillic letter И
octets.decode('koi8_r')

'MontrИal'

In [29]:
octets.decode('utf_8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [30]:
# � is U+FFFD, the REPLACEMENT CHARACTER
octets.decode('utf_8', errors='replace')

'Montr�al'

In [32]:
# If you load a .py file and get an error like:
# SyntaxError: Non-UTF-8 code starting with '\xe1'
# you can add a magic coding comment at the top of the file:
# coding: cp1252 <---- THIS!
# but it's better to convert everything to UTF-8!

In [33]:
# The leading bytes b'\xff\xfe' are a BOM (byte-order mark) denoting “little-endian” byte ordering.
u16 = 'El Niño'.encode('utf_16')
u16
# By design there is no U+FFFE character in Unicode, so the byte sequence b'\xff\xfe' must mean the ZERO WIDTH NO-BREAK SPACE (U+FEFF) in a little-endian encoding.

b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

In [34]:
# Write and read back through context managers so each handle is flushed and
# closed deterministically; a bare open(...).write(...) leaves that to the
# garbage collector.
with open('cafe.txt', 'w', encoding='utf_8') as cafe_out:
    cafe_out.write('café')
# No encoding is passed on purpose: the read uses the platform default.
# This works perfectly because I am on a Mac (default encoding is UTF-8).
with open('cafe.txt') as cafe_in:
    cafe_text = cafe_in.read()
cafe_text

'café'

In [36]:
fp = open('cafe.txt', 'w', encoding='utf_8')
fp

<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>

In [37]:
fp.write('café')

4

In [38]:
fp.close()

In [39]:
import os
os.stat('cafe.txt').st_size # 5 bytes é is 2 bytes: 0xc3 and 0xa9

5

In [40]:
fp2 = open('cafe.txt')
fp2 # on my machine this is still utf-8 by default
# Never rely on the default encoding reported by locale.getpreferredencoding(); always pass encoding= explicitly.

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='UTF-8'>

In [41]:
fp2.encoding

'UTF-8'

In [42]:
fp2.read()

'café'

In [44]:
fp3 = open('cafe.txt', encoding='utf_8')
fp3

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>

In [45]:
fp3.read()

'café'

In [47]:
fp4 = open('cafe.txt', encoding='cp1252')
fp4

<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>

In [48]:
fp4.read()

'cafÃ©'

In [53]:
import sys
import unicodedata
# Print the interpreter version, then whether stdout is a terminal and which
# encoding it uses, before attempting the output test below.
print(sys.version)
print()
print('sys.stdout.isatty():', sys.stdout.isatty())
print('sys.stdout.encoding:', sys.stdout.encoding)
print()

# Sample non-ASCII characters, written by their Unicode names.
test_chars = [
    '\N{HORIZONTAL ELLIPSIS}',
    '\N{INFINITY}',
    '\N{CIRCLED NUMBER FORTY TWO}',
]

# Announce each character by name first, then print the character itself.
for char in test_chars:
    print(f'Trying to output {unicodedata.name(char)}:')
    print(char)

3.10.15 | packaged by conda-forge | (main, Oct 16 2024, 01:24:20) [Clang 17.0.6 ]

sys.stdout.isatty(): False
sys.stdout.encoding: UTF-8

Trying to output HORIZONTAL ELLIPSIS:
…
Trying to output INFINITY:
∞
Trying to output CIRCLED NUMBER FORTY TWO:
㊷


In [54]:
s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'
s1, s2

('café', 'café')

In [55]:
len(s1), len(s2)

(4, 5)

In [56]:
s1 == s2

False

In [57]:
# Normalizing (recommended by the W3C)
from unicodedata import normalize
s1 = 'café'
s2 = 'cafe\N{COMBINING ACUTE ACCENT}'
len(s1), len(s2)

(4, 5)

In [58]:
len(normalize('NFC', s1)), len(normalize('NFC', s2))

(4, 4)

In [59]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))

(5, 5)

In [60]:
normalize('NFC', s1) == normalize('NFC', s2)

True

In [61]:
normalize('NFD', s1) == normalize('NFD', s2)

True

In [69]:
# Ω can have many meanings, always good to normalize to avoid surprises
ohm = '\u2126'
unicodedata.name(ohm)

'OHM SIGN'

In [70]:
ohm_c = normalize('NFC', ohm)
unicodedata.name(ohm_c)

'GREEK CAPITAL LETTER OMEGA'

In [71]:
ohm == ohm_c

False

In [72]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)

True

In [73]:
# Stronger forms of normalization, NFKC and NFKD, also affect compatibility characters.
# They cause data loss, so they should only be applied in special cases such as search and indexing,
# not for storage of text.
half = '\N{VULGAR FRACTION ONE HALF}'
print(half)

½


In [74]:
normalize('NFKC', half)

'1⁄2'

In [75]:
for char in normalize('NFKC', half):
    print(char, unicodedata.name(char), sep='\t')

1	DIGIT ONE
⁄	FRACTION SLASH
2	DIGIT TWO


In [77]:
four_squared = '4²'
normalize('NFKC', four_squared)

'42'

In [90]:
mu = 'µ'
mu_kc = normalize('NFKC', mu)
mu, mu_kc

('µ', 'μ')

In [91]:
ord(mu), ord(mu_kc)

(181, 956)

In [92]:
unicodedata.name(mu), unicodedata.name(mu_kc)

('MICRO SIGN', 'GREEK SMALL LETTER MU')

In [99]:
micro = 'µ'
unicodedata.name(micro)

'MICRO SIGN'

In [100]:
micro_cf = micro.casefold()
unicodedata.name(micro_cf)

'GREEK SMALL LETTER MU'

In [101]:
eszett = 'ß'
unicodedata.name(eszett)

'LATIN SMALL LETTER SHARP S'

In [102]:
eszett_cf = eszett.casefold()
eszett, eszett_cf

('ß', 'ss')

In [103]:
# utility functions
def nfc_equal(str1, str2):
    """Return True when both strings are identical after NFC normalization.

    Composed and decomposed spellings of the same text (for example 'é'
    versus 'e' followed by COMBINING ACUTE ACCENT) compare equal.
    """
    left, right = (normalize('NFC', text) for text in (str1, str2))
    return left == right

def fold_equal(str1, str2):
    """Compare two strings ignoring canonical form and letter case.

    Each argument is NFC-normalized and then case-folded before the
    comparison, so 'Straße' and 'strasse' are considered equal.
    """
    folded1 = normalize('NFC', str1).casefold()
    folded2 = normalize('NFC', str2).casefold()
    return folded1 == folded2

In [104]:
# One of Google's search secrets is ignoring diacritics (accents, cedillas, etc.).
# Removing them is not a proper form of normalization, but diacritics come and go in languages
# and people are lazy about typing them. It also makes URLs readable: São Paulo (https://en.wikipedia.org/wiki/S%C3%A3o_Paulo)
import string

def shave_marks(txt):
    """Strip every combining mark from ``txt``.

    The text is decomposed with NFD so that diacritics become separate
    combining characters, those characters are dropped, and the remainder is
    recomposed with NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))

In [105]:
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
shave_marks(order)

'“Herr Voß: • ½ cup of Œtker™ caffe latte • bowl of acai.”'

In [106]:
Greek = 'Ζέφυρος, Zéfiro'
shave_marks(Greek)

'Ζεφυρος, Zefiro'

In [113]:
# `shave_marks` also changes non-Latin characters (e.g. Greek), which never become ASCII just by losing their accents.
# Instead, we need to analyze each base character:
def shave_marks_latin(txt):
    """Remove diacritic marks, but only those attached to ASCII letters.

    The text is decomposed with NFD and scanned left to right. A combining
    mark is dropped when the most recent base character is an ASCII letter;
    marks that follow any other base character are kept. The result is
    recomposed with NFC.
    """
    kept = []
    after_latin = False  # is the most recent base character an ASCII letter?
    for ch in unicodedata.normalize('NFD', txt):
        if unicodedata.combining(ch):
            if after_latin:
                continue  # diacritic on a Latin base character: drop it
            kept.append(ch)
        else:
            # a non-combining character starts a new base
            kept.append(ch)
            after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))

In [114]:
# Western typographical symbols into ASCII
# mapping table for char-to-char replacement
# Char-to-char table: each symbol in the first string maps to the ASCII
# character at the same position in the second string.
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""", """'f"^<''""---~>""")
# Char-to-string table, merged with the char-to-char entries so that a single
# translate() call applies both kinds of replacement.
multi_map = {
    **str.maketrans({
        '€': 'EUR',
        '…': '...',
        'Æ': 'AE',
        'æ': 'ae',
        'Œ': 'OE',
        'œ': 'oe',
        '™': '(TM)',
        '‰': '<per mille>',
        '†': '**',
        '‡': '***',
    }),
    **single_map,
}


def dewinize(txt):
    """Replace Win1252 symbols with ASCII characters or sequences.

    Only the characters listed in ``multi_map`` are replaced; every other
    character is returned unchanged.
    """
    return txt.translate(multi_map)

def asciize(txt):
    """Push ``txt`` toward plain ASCII in three steps.

    First the symbols handled by ``dewinize`` are replaced, then diacritics
    on Latin letters are removed with ``shave_marks_latin``, 'ß' is spelled
    out as 'ss', and finally NFKC normalization is applied.
    """
    shaved = shave_marks_latin(dewinize(txt))
    return unicodedata.normalize('NFKC', shaved.replace('ß', 'ss'))

In [115]:
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
dewinize(order)

'"Herr Voß: - ½ cup of OEtker(TM) caffè latte - bowl of açaí."'

In [116]:
asciize(order)

'"Herr Voss: - 1⁄2 cup of OEtker(TM) caffe latte - bowl of acai."'

In [117]:
# depending on the written language the rules may vary for removing diacritics

In [118]:
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted(fruits)

['acerola', 'atemoia', 'açaí', 'caju', 'cajá']

In [119]:
# Sorting rules also vary between locales. The default sort above places the words with diacritics after the plain ones;
# for Portuguese the list should be sorted as: ['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

In [120]:
# Call setlocale(LC_COLLATE, your_locale) before using locale.strxfrm as the key when sorting.
# The OS has to support the requested locale.
import locale
my_locale = locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
print(my_locale)

pt_BR.UTF-8


In [121]:
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=locale.strxfrm)
print(sorted_fruits) # works for me on macOS v15.1

['açaí', 'acerola', 'atemoia', 'cajá', 'caju']


In [123]:
# pyuca is pure Python, but it may not respect the sorting rules of some languages; consider PyICU if that matters
import pyuca
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits # in case the sorted didn't work above

['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

In [125]:
# can be used in apps to search emojis
unicodedata.name('📱')

'MOBILE PHONE'

In [126]:
unicodedata.name('💻')

'PERSONAL COMPUTER'

In [127]:
START, END = ord(' '), sys.maxunicode + 1


def find(*query_words, start=START, end=END):
    """Print each character whose Unicode name contains all the query words.

    Code points in ``range(start, end)`` are scanned. Query words are
    upper-cased and matched against the whitespace-separated words of the
    character name. Every match is printed as a tab-separated line:
    ``U+XXXX``, the character itself, and its name.
    """
    wanted = {word.upper() for word in query_words}
    for code in range(start, end):
        char = chr(code)
        name = unicodedata.name(char, None)
        if not name:
            continue  # code point has no name
        if wanted <= set(name.split()):
            print(f'U+{code:04X}\t{char}\t{name}')

def main(words):
    """Run ``find`` with the given words, or print a hint when there are none."""
    if not words:
        print('Please provide words to find')
        return
    find(*words)

if __name__ == '__main__':
    main(sys.argv[1:])

In [128]:
import re

# The same two patterns, compiled once for str input and once for bytes input.
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# Sample text mixing Tamil digits, ASCII digits and superscript digits.
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")
text_bytes = text_str.encode('utf_8')

print(f'Text\n {text_str!r}')
# For each pattern, show the str matches next to the bytes matches.
for heading, str_re, bytes_re in (
    ('Numbers', re_numbers_str, re_numbers_bytes),
    ('Words', re_words_str, re_words_bytes),
):
    print(heading)
    print('  str  :', str_re.findall(text_str))
    print('  bytes:', bytes_re.findall(text_bytes))

Text
 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']
Words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']


In [129]:
# (╯°□°)╯︵ ┻━┻ 