# Unicode and UTF-8

## Unicode

We can convert between a character and its Unicode code point with:
```
ord(char)  # character -> Unicode code point (returned as an int, shown in decimal)

chr(code_point)  # Unicode code point (int) -> character
```

In [1]:
ord('a')

97

In [2]:
chr(97)

'a'

## UTF-8

We can get UTF-8 representation by

```
string.encode("utf-8")  # str (one or more characters) -> UTF-8 bytes; the repr shows non-ASCII bytes as \x hex escapes

utf8_bytes.decode("utf-8")  # UTF-8 bytes -> str
```

In [3]:
"é".encode("utf-8")  # raw byte representation

b'\xc3\xa9'

In [4]:
# Round trip: encode the string to UTF-8 bytes, then decode those bytes back
# into the original string.
raw_byte = "é".encode("utf-8")
raw_byte.decode('utf-8')

'é'

In [5]:
# list() turns the bytes object into a list of its byte values, shown as decimal integers
list("é".encode("utf-8"))  

[195, 169]

In [6]:
# bytes() turns the list of integers back into raw bytes, and decode() turns those bytes back into a string
bytes([195, 169]).decode("utf-8")

'é'

## Compare two encodings

In [7]:
# text from https://www.reedbeta.com/blog/programmers-intro-to-unicode/
text = "Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception."
# UTF-8 encode the sample text and keep each byte as a plain integer in the
# range 0..255, so every byte can be treated as one token id.
tokens = [int(byte) for byte in text.encode("utf-8")]

# Show the text framed by separator lines.
print('---', text, '---', sep='\n')
# len() of a str counts code points; len(tokens) counts UTF-8 bytes.
print("\nlength of character / Unicode code point:", len(text))
print('---')
print("\nlength of raw bytes:", len(tokens))

---
Ｕｎｉｃｏｄｅ! 🅤🅝🅘🅒🅞🅓🅔‽ 🇺‌🇳‌🇮‌🇨‌🇴‌🇩‌🇪! 😄 The very name strikes fear and awe into the hearts of programmers worldwide. We all know we ought to “support Unicode” in our software (whatever that means—like using wchar_t for all the strings, right?). But Unicode can be abstruse, and diving into the thousand-page Unicode Standard plus its dozens of supplementary annexes, reports, and notes can be more than a little intimidating. I don’t blame programmers for still finding the whole thing mysterious, even 30 years after Unicode’s inception.
---

length of character / Unicode code point: 533
---

length of raw bytes: 616


# Byte-Pair Encoding (BPE)


In [8]:
def get_stats(tokenised_text):
    """
    Tally how often each pair of neighbouring tokens occurs.

    Args:
    tokenised_text: sequence of token ids, e.g. [189, 142, 239, ...]
    Returns:
    dict mapping (left, right) -> number of positions at which `left` is
    immediately followed by `right`. Overlapping pairs are all counted, and
    keys are inserted in order of first appearance.
    """
    pair_counts = {}
    # Walk every position that still has a right-hand neighbour.
    for pos in range(len(tokenised_text) - 1):
        neighbours = (tokenised_text[pos], tokenised_text[pos + 1])
        if neighbours in pair_counts:
            pair_counts[neighbours] += 1
        else:
            pair_counts[neighbours] = 1
    return pair_counts

def tokenised_input(old_tokenised_text, pair, new_token_idx):
    """
    Re-tokenise a sequence after one merge has been added to the vocabulary.

    Scans left to right and replaces every non-overlapping occurrence of
    `pair` with `new_token_idx`; all other tokens are copied unchanged.

    Args:
    old_tokenised_text: [189, 142, 239, 189, 1, ...]  # tokens under the old vocab
    pair: (idx_1, idx_2), the two adjacent token ids to merge
    new_token_idx: id that stands for the merged pair
    Returns:
    A new list holding the sequence tokenised with the extended vocabulary;
    the input sequence is not modified.
    """
    merged = []
    last_pos = len(old_tokenised_text) - 1
    skip_next = False
    for pos, token in enumerate(old_tokenised_text):
        if skip_next:
            # This token was the second half of a pair merged on the previous step.
            skip_next = False
            continue
        if pos < last_pos and token == pair[0] and old_tokenised_text[pos + 1] == pair[1]:
            merged.append(new_token_idx)
            skip_next = True
        else:
            merged.append(token)
    return merged

In [9]:
# Train the merges: repeatedly replace the most frequent adjacent pair of
# tokens with a brand-new token id until the vocabulary reaches `vocab_size`.
vocab_size = 276 # the desired final vocabulary size
num_merges = vocab_size - 256 # ids 0..255 are already taken by the raw byte values
tokenised_text = list(tokens) # copy so we don't destroy the original list

merges = {} # (int, int) -> int
for i in range(num_merges):
    stats = get_stats(tokenised_text)
    if not stats:
        # Fewer than two tokens are left, so there is no pair to merge.
        # Without this guard max() would raise ValueError on the empty dict.
        print(f"no pairs left to merge; stopping after {i} merges")
        break
    # Most frequent pair; on ties max() keeps the first pair inserted into stats.
    pair = max(stats, key=stats.get)
    idx = 256 + i
    print(f"merging {pair} into a new token {idx}")
    tokenised_text = tokenised_input(tokenised_text, pair, idx)
    merges[pair] = idx

merging (101, 32) into a new token 256
merging (240, 159) into a new token 257
merging (226, 128) into a new token 258
merging (105, 110) into a new token 259
merging (115, 32) into a new token 260
merging (97, 110) into a new token 261
merging (116, 104) into a new token 262
merging (257, 133) into a new token 263
merging (257, 135) into a new token 264
merging (97, 114) into a new token 265
merging (239, 189) into a new token 266
merging (258, 140) into a new token 267
merging (267, 264) into a new token 268
merging (101, 114) into a new token 269
merging (111, 114) into a new token 270
merging (116, 32) into a new token 271
merging (259, 103) into a new token 272
merging (115, 116) into a new token 273
merging (261, 100) into a new token 274
merging (32, 262) into a new token 275


In [12]:
# Compare sequence lengths before and after merging: `tokens` is the raw UTF-8
# byte list and `tokenised_text` is the same text after the merge loop.
print("raw bytes length:", len(tokens))
print("encoded token size:", len(tokenised_text))
print(f"compression ratio: {len(tokens) / len(tokenised_text):.2f}X")

raw bytes length: 616
encoded token size: 451
compression ratio: 1.37X
