# Unicode Text vs Bytes

In [3]:
### Encoding and decoding
# Round trip: str -> bytes (encode) -> str (decode).
# len() counts characters for a str but bytes for the encoded form, so the
# UTF-8 result is one longer because 'é' needs two bytes.
s = 'café'
encode_s = s.encode('utf8')
decode_s = encode_s.decode('utf8')

for value in (s, encode_s, decode_s):
    print(len(value), value)

4 café
5 b'caf\xc3\xa9'
4 café


In [11]:
## bytes (immutable) vs bytearray (mutable)

# Both constructors encode the text with the given codec. Indexing either
# object yields an int (the byte value), not a one-byte sequence.
cafe = bytes('café', encoding='utf8')
cafe_arr = bytearray('café', encoding='utf8')

for seq in (cafe, cafe_arr):
    print(seq, len(seq))
    print(seq[0])

# Mutating: the bytearray accepts item assignment, so replace the first
# byte (99, 'c') with 65 ('A').
cafe_arr[0] = ord('A')
print(cafe_arr, len(cafe_arr))

# The modified buffer is still valid UTF-8 and decodes back to text.
print(cafe_arr.decode('utf-8'))

b'caf\xc3\xa9' 5
99
bytearray(b'caf\xc3\xa9') 5
99
bytearray(b'Aaf\xc3\xa9') 5
Aafé


In [12]:
# Basic encoders and decoders: the same text produces a different byte
# sequence (and length) under each codec.
el_nino = 'El Niño'
for codec in ('latin_1', 'utf_8', 'utf_16'):
    print(codec, el_nino.encode(codec))

latin_1 b'El Ni\xf1o'
utf_8 b'El Ni\xc3\xb1o'
utf_16 b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'


In [14]:
# Writing files: store the sample text on disk, encoded as UTF-8.
from pathlib import Path

# Create the target directory first so this cell also runs on a fresh
# checkout; open(..., 'w') creates the file but not a missing parent directory.
Path("files").mkdir(parents=True, exist_ok=True)

with open("files/cafe.txt", 'w', encoding='utf8') as fw:
    fw.write('café')

In [15]:
# Read the file back, decoding its bytes as UTF-8 (read mode is the default).
with open("files/cafe.txt", encoding='utf8') as fs:
    content = fs.read()

print(content)

café


In [16]:
# Number of bytes the text in cafe.txt occupies on disk.
import os

size_in_bytes = os.path.getsize("files/cafe.txt")
# Note: 5 rather than 4, because the final special character takes two bytes.
print(size_in_bytes)

5


In [18]:
# getting the default encoding in Mac Book - happens to be utf-8
fp = open('files/cafe.txt')
print(fp)

<_io.TextIOWrapper name='files/cafe.txt' mode='r' encoding='UTF-8'>


In [21]:
## Sorting Unicode text
# The standard way to sort non-ASCII text in Python is locale.strxfrm, which
# transforms a string into a form suitable for locale-aware comparison.
# Requires the pt_BR.UTF-8 locale to be available on the system.

import locale

my_locale = locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
print(my_locale)

fruits = ['caju', 'atemonia', 'cajá', 'açaí', 'acerola']

# Sort a copy in place so the original list keeps its order.
sorted_fruits = list(fruits)
sorted_fruits.sort(key=locale.strxfrm)
print(sorted_fruits)

pt_BR.UTF-8
['açaí', 'acerola', 'atemonia', 'cajá', 'caju']


In [22]:
!pip install pyuca

Collecting pyuca
  Downloading pyuca-1.2-py2.py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pyuca
Successfully installed pyuca-1.2


In [23]:
# Unicode Collation Algorithm (UCA)
# pyuca's Collator provides a sort key for each string, so no locale
# needs to be set for this sort.
import pyuca

coll = pyuca.Collator()

# Sort a copy of the fruit list using the collator's sort keys.
sorted_fruits = list(fruits)
sorted_fruits.sort(key=coll.sort_key)
print(sorted_fruits)

['açaí', 'acerola', 'atemonia', 'cajá', 'caju']


In [24]:
# The Unicode database
# The unicodedata module exposes per-character metadata; name() returns the
# official Unicode name of a single character.

from unicodedata import name

for char in ('A', 'á'):
    print(name(char))


LATIN CAPITAL LETTER A
LATIN SMALL LETTER A WITH ACUTE


In [28]:
# String vs Bytes in Regex
# The same digit/word patterns are compiled twice: once from str and once
# from bytes. The bytes patterns only match ASCII digits and word characters,
# while the str patterns also match the Tamil digits in the sample text.
""" 
 When building a regex for bytes - its matched with ascii characters
 When building regex for string - its matched with unicode/ascii
"""

import re

re_number_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_number_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

# The escapes \u0be7\u0bed\u0be8\u0bef are the Tamil digits for 1729.
text_str = 'Ramanujan saw \u0be7\u0bed\u0be8\u0bef as 1729 = 1^3 + 12^3 = 9^3 + 10^3.'
text_bytes = text_str.encode('utf8')

print(f"Text\t{text_str!r}")

# One section per pattern kind, each showing the str result and then the bytes result.
for heading, str_pattern, bytes_pattern in (
    ('Numbers', re_number_str, re_number_bytes),
    ('String', re_words_str, re_words_bytes),
):
    print(heading)
    print('str', str_pattern.findall(text_str))
    print('bytes', bytes_pattern.findall(text_bytes))

Text	'Ramanujan saw ௧௭௨௯ as 1729 = 1^3 + 12^3 = 9^3 + 10^3.'
Numbers
str ['௧௭௨௯', '1729', '1', '3', '12', '3', '9', '3', '10', '3']
bytes [b'1729', b'1', b'3', b'12', b'3', b'9', b'3', b'10', b'3']
String
str ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1', '3', '12', '3', '9', '3', '10', '3']
bytes [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'3', b'12', b'3', b'9', b'3', b'10', b'3']
