In [2]:
import os
import gzip

from functools import lru_cache

import ftfy
import html

import re

In [3]:
@lru_cache()
def default_bpe():   # bpe text file을 가져오는 함수
    return os.path.join(os.path.dirnmae(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")

In [4]:
@lru_cache()
def bytes_to_unicode(): # gz파일을 decode 하는 함수
    """
    Return list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 100 token dataset you end up needing around 5K for decent coverage.
    This is a significant percent of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mappingto whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("i"), ord("-")+1))+list(range(ord("o"), ord("y")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

In [5]:
bytes_to_unicode()

{33: '!',
 34: '"',
 35: '#',
 36: '$',
 37: '%',
 38: '&',
 39: "'",
 40: '(',
 41: ')',
 42: '*',
 43: '+',
 44: ',',
 45: '-',
 46: '.',
 47: '/',
 48: '0',
 49: '1',
 50: '2',
 51: '3',
 52: '4',
 53: '5',
 54: '6',
 55: '7',
 56: '8',
 57: '9',
 58: ':',
 59: ';',
 60: '<',
 61: '=',
 62: '>',
 63: '?',
 64: '@',
 65: 'A',
 66: 'B',
 67: 'C',
 68: 'D',
 69: 'E',
 70: 'F',
 71: 'G',
 72: 'H',
 73: 'I',
 74: 'J',
 75: 'K',
 76: 'L',
 77: 'M',
 78: 'N',
 79: 'O',
 80: 'P',
 81: 'Q',
 82: 'R',
 83: 'S',
 84: 'T',
 85: 'U',
 86: 'V',
 87: 'W',
 88: 'X',
 89: 'Y',
 90: 'Z',
 91: '[',
 92: '\\',
 93: ']',
 94: '^',
 95: '_',
 96: '`',
 97: 'a',
 98: 'b',
 99: 'c',
 100: 'd',
 101: 'e',
 102: 'f',
 103: 'g',
 104: 'h',
 105: 'i',
 106: 'j',
 107: 'k',
 108: 'l',
 109: 'm',
 110: 'n',
 111: 'o',
 112: 'p',
 113: 'q',
 114: 'r',
 115: 's',
 116: 't',
 117: 'u',
 118: 'v',
 119: 'w',
 120: 'x',
 121: 'y',
 122: 'z',
 123: '{',
 124: '|',
 125: '}',
 126: '~',
 0: 'Ā',
 1: 'ā',
 2: 'Ă',
 3: '

In [6]:
def get_pairs(word):
    """
    Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length strings).
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

In [7]:
get_pairs("chair")

{('a', 'i'), ('c', 'h'), ('h', 'a'), ('i', 'r')}

In [8]:
def basic_clean(text):  # 그대로 뱉어내줌
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()

In [9]:
# basic_clean("chair") # chair
basic_clean("hello world!!!") # hello world!!!

'hello world!!!'

In [10]:
byte_encoder = bytes_to_unicode()
byte_encoder

{33: '!',
 34: '"',
 35: '#',
 36: '$',
 37: '%',
 38: '&',
 39: "'",
 40: '(',
 41: ')',
 42: '*',
 43: '+',
 44: ',',
 45: '-',
 46: '.',
 47: '/',
 48: '0',
 49: '1',
 50: '2',
 51: '3',
 52: '4',
 53: '5',
 54: '6',
 55: '7',
 56: '8',
 57: '9',
 58: ':',
 59: ';',
 60: '<',
 61: '=',
 62: '>',
 63: '?',
 64: '@',
 65: 'A',
 66: 'B',
 67: 'C',
 68: 'D',
 69: 'E',
 70: 'F',
 71: 'G',
 72: 'H',
 73: 'I',
 74: 'J',
 75: 'K',
 76: 'L',
 77: 'M',
 78: 'N',
 79: 'O',
 80: 'P',
 81: 'Q',
 82: 'R',
 83: 'S',
 84: 'T',
 85: 'U',
 86: 'V',
 87: 'W',
 88: 'X',
 89: 'Y',
 90: 'Z',
 91: '[',
 92: '\\',
 93: ']',
 94: '^',
 95: '_',
 96: '`',
 97: 'a',
 98: 'b',
 99: 'c',
 100: 'd',
 101: 'e',
 102: 'f',
 103: 'g',
 104: 'h',
 105: 'i',
 106: 'j',
 107: 'k',
 108: 'l',
 109: 'm',
 110: 'n',
 111: 'o',
 112: 'p',
 113: 'q',
 114: 'r',
 115: 's',
 116: 't',
 117: 'u',
 118: 'v',
 119: 'w',
 120: 'x',
 121: 'y',
 122: 'z',
 123: '{',
 124: '|',
 125: '}',
 126: '~',
 0: 'Ā',
 1: 'ā',
 2: 'Ă',
 3: '

In [11]:
byte_encoder2 = {v:k for k,v in byte_encoder.items()}
byte_encoder2

{'!': 33,
 '"': 34,
 '#': 35,
 '$': 36,
 '%': 37,
 '&': 38,
 "'": 39,
 '(': 40,
 ')': 41,
 '*': 42,
 '+': 43,
 ',': 44,
 '-': 45,
 '.': 46,
 '/': 47,
 '0': 48,
 '1': 49,
 '2': 50,
 '3': 51,
 '4': 52,
 '5': 53,
 '6': 54,
 '7': 55,
 '8': 56,
 '9': 57,
 ':': 58,
 ';': 59,
 '<': 60,
 '=': 61,
 '>': 62,
 '?': 63,
 '@': 64,
 'A': 65,
 'B': 66,
 'C': 67,
 'D': 68,
 'E': 69,
 'F': 70,
 'G': 71,
 'H': 72,
 'I': 73,
 'J': 74,
 'K': 75,
 'L': 76,
 'M': 77,
 'N': 78,
 'O': 79,
 'P': 80,
 'Q': 81,
 'R': 82,
 'S': 83,
 'T': 84,
 'U': 85,
 'V': 86,
 'W': 87,
 'X': 88,
 'Y': 89,
 'Z': 90,
 '[': 91,
 '\\': 92,
 ']': 93,
 '^': 94,
 '_': 95,
 '`': 96,
 'a': 97,
 'b': 98,
 'c': 99,
 'd': 100,
 'e': 101,
 'f': 102,
 'g': 103,
 'h': 104,
 'i': 105,
 'j': 106,
 'k': 107,
 'l': 108,
 'm': 109,
 'n': 110,
 'o': 111,
 'p': 112,
 'q': 113,
 'r': 114,
 's': 115,
 't': 116,
 'u': 117,
 'v': 118,
 'w': 119,
 'x': 120,
 'y': 121,
 'z': 122,
 '{': 123,
 '|': 124,
 '}': 125,
 '~': 126,
 'Ā': 0,
 'ā': 1,
 'Ă': 2,
 'ă':

In [12]:
merges = gzip.open('bpe_simple_vocab_16e6.txt.gz').read().decode("utf-8").split('\n')
# print(len(merges))  # 262146
print(merges)



In [13]:
print(len(merges))

262146


In [14]:
# print(merges[0]) # bpe_simple_vocab_16e6.txt#version: 0.2
# print(merges[1]) # i n
# print(merges[49152]) # t la</w>
# print(merges[256]) # bu t </w>
# print(merges[2]) # t h
print(merges[1])  # i n

i n


In [15]:
merges2 = merges[1:49152-256-2+1]
# print(len(merges2))  # 48894
print(merges2)

['i n', 't h', 'a n', 'r e', 'a r', 'e r', 'th e</w>', 'in g</w>', 'o u', 'o n', 's t', 'o r', 'e n', 'o n</w>', 'a l', 'a t', 'e r</w>', 'i t', 'i n</w>', 't o</w>', 'r o', 'i s</w>', 'l e', 'i c', 'a t</w>', 'an d</w>', 'e d</w>', 'o f</w>', 'c h', 'o r</w>', 'e s</w>', 'i l', 'e l', 's t</w>', 'a c', 'o m', 'a m', 'l o', 'a n</w>', 'a y</w>', 's h', 'r i', 'l i', 't i', 'f or</w>', 'n e', 'ð Ł', 'r a', 'h a', 'd e', 'o l', 'v e</w>', 's i', 'u r', 'a l</w>', 's e', "' s</w>", 'u n', 'd i', 'b e', 'l a', 'w h', 'o o', 'd ay</w>', 'e n</w>', 'm a', 'n o', 'l e</w>', 't o', 'ou r</w>', 'i r', 'g h', 'w it', 'i t</w>', 'y o', 'a s', 's p', 'th is</w>', 't s</w>', 'at i', 'yo u</w>', 'wit h</w>', 'a d', 'i s', 'a b', 'l y</w>', 'w e', 'th e', 't e', 'a s</w>', 'a g', 'v i', 'p p', 's u', 'h o', 'm y</w>', '. .', 'b u', 'c om', 's e</w>', 'er s</w>', 'm e', 'm e</w>', 'al l</w>', 'c on', 'm o', 'k e</w>', 'g e', 'ou t</w>', 'en t</w>', 'c o', 'f e', 'v er', 'a r</w>', 'f ro', 'a u', 'p o'

In [16]:
# Elements removed from merges that are not in merges2
removed_elements = [item for item in merges if item not in merges2]

# Print the number of removed elements
print("Total removed elements:", len(removed_elements))

# Print the first 10 and last 10 elements (or less if there aren't that many)
print("First 10 removed elements:", removed_elements[:10])
print("Last 10 removed elements:", removed_elements[-10:])

Total removed elements: 213252
First 10 removed elements: ['"bpe_simple_vocab_16e6.txt#version: 0.2', 'ha bib</w>', 'fre ya</w>', 'fjor d</w>', 'ex porter</w>', 'to sa</w>', 'store day</w>', 'maj id</w>', 'ba the</w>', 'cham paign</w>']
Last 10 removed elements: ['southeast ward</w>', 'soccer saturday</w>', 'so zone</w>', 'smid t</w>', 'sm city', 'sli mey</w>', 'sin claire</w>', 'sd reader</w>', 'scare d', '']


In [17]:
# Elements removed from merges that are not in merges2
removed_elements = [item for item in merges if item not in merges2]

# Save the removed elements to a text file
with open('removed_elements.txt', 'w', encoding='utf-8') as f:
    for item in removed_elements:
        f.write(f"{item}\n")

print("Removed elements saved to 'removed_elements.txt'")

Removed elements saved to 'removed_elements.txt'


In [18]:
merges3 = [tuple(merge.split()) for merge in merges2]
print(merges3)


# Print the number of removed elements
print("Total split merges3:", len(merges3))

# Print the first 10 and last 10 elements (or less if there aren't that many)
print("First 10 split merges3:", merges3[:10])
print("Last 10 split merges3:", merges3[-10:])

[('i', 'n'), ('t', 'h'), ('a', 'n'), ('r', 'e'), ('a', 'r'), ('e', 'r'), ('th', 'e</w>'), ('in', 'g</w>'), ('o', 'u'), ('o', 'n'), ('s', 't'), ('o', 'r'), ('e', 'n'), ('o', 'n</w>'), ('a', 'l'), ('a', 't'), ('e', 'r</w>'), ('i', 't'), ('i', 'n</w>'), ('t', 'o</w>'), ('r', 'o'), ('i', 's</w>'), ('l', 'e'), ('i', 'c'), ('a', 't</w>'), ('an', 'd</w>'), ('e', 'd</w>'), ('o', 'f</w>'), ('c', 'h'), ('o', 'r</w>'), ('e', 's</w>'), ('i', 'l'), ('e', 'l'), ('s', 't</w>'), ('a', 'c'), ('o', 'm'), ('a', 'm'), ('l', 'o'), ('a', 'n</w>'), ('a', 'y</w>'), ('s', 'h'), ('r', 'i'), ('l', 'i'), ('t', 'i'), ('f', 'or</w>'), ('n', 'e'), ('ð', 'Ł'), ('r', 'a'), ('h', 'a'), ('d', 'e'), ('o', 'l'), ('v', 'e</w>'), ('s', 'i'), ('u', 'r'), ('a', 'l</w>'), ('s', 'e'), ("'", 's</w>'), ('u', 'n'), ('d', 'i'), ('b', 'e'), ('l', 'a'), ('w', 'h'), ('o', 'o'), ('d', 'ay</w>'), ('e', 'n</w>'), ('m', 'a'), ('n', 'o'), ('l', 'e</w>'), ('t', 'o'), ('ou', 'r</w>'), ('i', 'r'), ('g', 'h'), ('w', 'it'), ('i', 't</w>'), ('y'

In [19]:
print(merges3)

[('i', 'n'), ('t', 'h'), ('a', 'n'), ('r', 'e'), ('a', 'r'), ('e', 'r'), ('th', 'e</w>'), ('in', 'g</w>'), ('o', 'u'), ('o', 'n'), ('s', 't'), ('o', 'r'), ('e', 'n'), ('o', 'n</w>'), ('a', 'l'), ('a', 't'), ('e', 'r</w>'), ('i', 't'), ('i', 'n</w>'), ('t', 'o</w>'), ('r', 'o'), ('i', 's</w>'), ('l', 'e'), ('i', 'c'), ('a', 't</w>'), ('an', 'd</w>'), ('e', 'd</w>'), ('o', 'f</w>'), ('c', 'h'), ('o', 'r</w>'), ('e', 's</w>'), ('i', 'l'), ('e', 'l'), ('s', 't</w>'), ('a', 'c'), ('o', 'm'), ('a', 'm'), ('l', 'o'), ('a', 'n</w>'), ('a', 'y</w>'), ('s', 'h'), ('r', 'i'), ('l', 'i'), ('t', 'i'), ('f', 'or</w>'), ('n', 'e'), ('ð', 'Ł'), ('r', 'a'), ('h', 'a'), ('d', 'e'), ('o', 'l'), ('v', 'e</w>'), ('s', 'i'), ('u', 'r'), ('a', 'l</w>'), ('s', 'e'), ("'", 's</w>'), ('u', 'n'), ('d', 'i'), ('b', 'e'), ('l', 'a'), ('w', 'h'), ('o', 'o'), ('d', 'ay</w>'), ('e', 'n</w>'), ('m', 'a'), ('n', 'o'), ('l', 'e</w>'), ('t', 'o'), ('ou', 'r</w>'), ('i', 'r'), ('g', 'h'), ('w', 'it'), ('i', 't</w>'), ('y'

In [20]:
vocab = list(byte_encoder2.values())
vocab

[33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [21]:
vocab = list(byte_encoder2.values())
# vocab
vocab2 = vocab + [str(v)+'</w>' for v in vocab]
print(vocab2)

[33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [22]:
for merge in merges3:
    vocab.append(''.join(merge))
print(vocab)



In [23]:
print(vocab)



In [24]:
vocab.extend(['<|startoftext|>', '<|endoftext|>'])

In [25]:
encoder_h = dict(zip(vocab, range(len(vocab))))
encoder_h

{33: 0,
 34: 1,
 35: 2,
 36: 3,
 37: 4,
 38: 5,
 39: 6,
 40: 7,
 41: 8,
 42: 9,
 43: 10,
 44: 11,
 45: 12,
 46: 13,
 47: 14,
 48: 15,
 49: 16,
 50: 17,
 51: 18,
 52: 19,
 53: 20,
 54: 21,
 55: 22,
 56: 23,
 57: 24,
 58: 25,
 59: 26,
 60: 27,
 61: 28,
 62: 29,
 63: 30,
 64: 31,
 65: 32,
 66: 33,
 67: 34,
 68: 35,
 69: 36,
 70: 37,
 71: 38,
 72: 39,
 73: 40,
 74: 41,
 75: 42,
 76: 43,
 77: 44,
 78: 45,
 79: 46,
 80: 47,
 81: 48,
 82: 49,
 83: 50,
 84: 51,
 85: 52,
 86: 53,
 87: 54,
 88: 55,
 89: 56,
 90: 57,
 91: 58,
 92: 59,
 93: 60,
 94: 61,
 95: 62,
 96: 63,
 97: 64,
 98: 65,
 99: 66,
 100: 67,
 101: 68,
 102: 69,
 103: 70,
 104: 71,
 105: 72,
 106: 73,
 107: 74,
 108: 75,
 109: 76,
 110: 77,
 111: 78,
 112: 79,
 113: 80,
 114: 81,
 115: 82,
 116: 83,
 117: 84,
 118: 85,
 119: 86,
 120: 87,
 121: 88,
 122: 89,
 123: 90,
 124: 91,
 125: 92,
 126: 93,
 0: 94,
 1: 95,
 2: 96,
 3: 97,
 4: 98,
 5: 99,
 6: 100,
 7: 101,
 8: 102,
 9: 103,
 10: 104,
 11: 105,
 12: 106,
 13: 107,
 14: 108,
 15

In [26]:
decoder_h = {v:k for k, v in encoder_h.items()}
decoder_h

{0: 33,
 1: 34,
 2: 35,
 3: 36,
 4: 37,
 5: 38,
 6: 39,
 7: 40,
 8: 41,
 9: 42,
 10: 43,
 11: 44,
 12: 45,
 13: 46,
 14: 47,
 15: 48,
 16: 49,
 17: 50,
 18: 51,
 19: 52,
 20: 53,
 21: 54,
 22: 55,
 23: 56,
 24: 57,
 25: 58,
 26: 59,
 27: 60,
 28: 61,
 29: 62,
 30: 63,
 31: 64,
 32: 65,
 33: 66,
 34: 67,
 35: 68,
 36: 69,
 37: 70,
 38: 71,
 39: 72,
 40: 73,
 41: 74,
 42: 75,
 43: 76,
 44: 77,
 45: 78,
 46: 79,
 47: 80,
 48: 81,
 49: 82,
 50: 83,
 51: 84,
 52: 85,
 53: 86,
 54: 87,
 55: 88,
 56: 89,
 57: 90,
 58: 91,
 59: 92,
 60: 93,
 61: 94,
 62: 95,
 63: 96,
 64: 97,
 65: 98,
 66: 99,
 67: 100,
 68: 101,
 69: 102,
 70: 103,
 71: 104,
 72: 105,
 73: 106,
 74: 107,
 75: 108,
 76: 109,
 77: 110,
 78: 111,
 79: 112,
 80: 113,
 81: 114,
 82: 115,
 83: 116,
 84: 117,
 85: 118,
 86: 119,
 87: 120,
 88: 121,
 89: 122,
 90: 123,
 91: 124,
 92: 125,
 93: 126,
 94: 0,
 95: 1,
 96: 2,
 97: 3,
 98: 4,
 99: 5,
 100: 6,
 101: 7,
 102: 8,
 103: 9,
 104: 10,
 105: 11,
 106: 12,
 107: 13,
 108: 14,
 10

In [27]:
bpe_ranks_h = dict(zip(merges3, range(len(merges3)))) 
bpe_ranks_h

{('i', 'n'): 0,
 ('t', 'h'): 1,
 ('a', 'n'): 2,
 ('r', 'e'): 3,
 ('a', 'r'): 4,
 ('e', 'r'): 5,
 ('th', 'e</w>'): 6,
 ('in', 'g</w>'): 7,
 ('o', 'u'): 8,
 ('o', 'n'): 9,
 ('s', 't'): 10,
 ('o', 'r'): 11,
 ('e', 'n'): 12,
 ('o', 'n</w>'): 13,
 ('a', 'l'): 14,
 ('a', 't'): 15,
 ('e', 'r</w>'): 16,
 ('i', 't'): 17,
 ('i', 'n</w>'): 18,
 ('t', 'o</w>'): 19,
 ('r', 'o'): 20,
 ('i', 's</w>'): 21,
 ('l', 'e'): 22,
 ('i', 'c'): 23,
 ('a', 't</w>'): 24,
 ('an', 'd</w>'): 25,
 ('e', 'd</w>'): 26,
 ('o', 'f</w>'): 27,
 ('c', 'h'): 28,
 ('o', 'r</w>'): 29,
 ('e', 's</w>'): 30,
 ('i', 'l'): 31,
 ('e', 'l'): 32,
 ('s', 't</w>'): 33,
 ('a', 'c'): 34,
 ('o', 'm'): 35,
 ('a', 'm'): 36,
 ('l', 'o'): 37,
 ('a', 'n</w>'): 38,
 ('a', 'y</w>'): 39,
 ('s', 'h'): 40,
 ('r', 'i'): 41,
 ('l', 'i'): 42,
 ('t', 'i'): 43,
 ('f', 'or</w>'): 44,
 ('n', 'e'): 45,
 ('ð', 'Ł'): 46,
 ('r', 'a'): 47,
 ('h', 'a'): 48,
 ('d', 'e'): 49,
 ('o', 'l'): 50,
 ('v', 'e</w>'): 51,
 ('s', 'i'): 52,
 ('u', 'r'): 53,
 ('a', 'l</w>'):

In [28]:
cache_h = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
# print(type(cache_h)) # dict
cache_h

{'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}

In [29]:
bpe_ranks_h = dict(zip(merges3, range(len(merges3))))
bpe_ranks_h

{('i', 'n'): 0,
 ('t', 'h'): 1,
 ('a', 'n'): 2,
 ('r', 'e'): 3,
 ('a', 'r'): 4,
 ('e', 'r'): 5,
 ('th', 'e</w>'): 6,
 ('in', 'g</w>'): 7,
 ('o', 'u'): 8,
 ('o', 'n'): 9,
 ('s', 't'): 10,
 ('o', 'r'): 11,
 ('e', 'n'): 12,
 ('o', 'n</w>'): 13,
 ('a', 'l'): 14,
 ('a', 't'): 15,
 ('e', 'r</w>'): 16,
 ('i', 't'): 17,
 ('i', 'n</w>'): 18,
 ('t', 'o</w>'): 19,
 ('r', 'o'): 20,
 ('i', 's</w>'): 21,
 ('l', 'e'): 22,
 ('i', 'c'): 23,
 ('a', 't</w>'): 24,
 ('an', 'd</w>'): 25,
 ('e', 'd</w>'): 26,
 ('o', 'f</w>'): 27,
 ('c', 'h'): 28,
 ('o', 'r</w>'): 29,
 ('e', 's</w>'): 30,
 ('i', 'l'): 31,
 ('e', 'l'): 32,
 ('s', 't</w>'): 33,
 ('a', 'c'): 34,
 ('o', 'm'): 35,
 ('a', 'm'): 36,
 ('l', 'o'): 37,
 ('a', 'n</w>'): 38,
 ('a', 'y</w>'): 39,
 ('s', 'h'): 40,
 ('r', 'i'): 41,
 ('l', 'i'): 42,
 ('t', 'i'): 43,
 ('f', 'or</w>'): 44,
 ('n', 'e'): 45,
 ('ð', 'Ł'): 46,
 ('r', 'a'): 47,
 ('h', 'a'): 48,
 ('d', 'e'): 49,
 ('o', 'l'): 50,
 ('v', 'e</w>'): 51,
 ('s', 'i'): 52,
 ('u', 'r'): 53,
 ('a', 'l</w>'):

In [31]:
# pat_h = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
# pat_h

In [32]:
def bpe(self, token):
    # dict형태로 붙여진 token
    if token in self.cache:
        return self.cache[token]
    word = tuple(token[:-1])+(token[-1] + '</w>',) # 문장의 끝에 </w>를 이어붙이기 위해 [:-1]과 [-1]을 이어붙임, 튜플형식으로
    pairs = get_pairs(word)  # token을 이어붙인 word를 [0] [1:] pair로 묶음

In [33]:
word = "chair"
pairs = get_pairs(word)
pairs

{('a', 'i'), ('c', 'h'), ('h', 'a'), ('i', 'r')}

In [34]:
bigram = min(pairs, key = lambda pair: bpe_ranks_h.get(pair, float('inf')))
bigram

('c', 'h')

In [35]:
f = lambda x:x+1
print(f(5))

6


In [36]:
word

'chair'

In [37]:
first, second = bigram 
new_word = []
i = 0
j = word.index(first, i)
# print(first, second) # c h
# print(j) # 0

print(word[i:]) # chair
# new_word.extend(word[i:]) 


# print(word[i]) # c
# print(first) # c

# print(len(word)) # 5

print(word[i+1]) # index가 1이라서 h

# if word[i] == first and i <len(word)-1 and word[i+1] == second:
#         new_word.append(first+second)
#         i += 2

chair
h


In [42]:
def bpe(self, token):
    # dict형태로 붙여진 token
    if token in self.cache:
        return self.cache[token]
    word = tuple(token[:-1])+(token[-1] + '</w>',) # 문장의 끝에 </w>를 이어붙이기 위해 [:-1]과 [-1]을 이어붙임, 튜플형식으로
    pairs = get_pairs(word)  # token을 이어붙인 word를 [0] [1:] pair로 묶음

    """ 주어진 토큰을 BPE(Byte Pair Encoding) 방식으로 인코딩하는 함수, 
    BPE는 하나의 문장이 모두 이어져 있는 것이 아닌 띄어쓰기 등으로 분리되어 있다고 가정             
    """

    if not pairs:
        return token+'</w>' 
    
    while True:
        bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))  # pair가 self.bpe_ranks안에 존재하는지 확인하고, 있다면 반환, 없다면 기본값 float('inf') 무한대를 반환
        # min()을통해 iterable에서 가장 작은 값을 반환
        # bigram의 출력물은 ('c', 'h') <-- 이런 형태로 반환됨
        if bigram not in self.bpe_ranks:  
            break
        first, second = bigram  # 출력물 예시: (c , h) 에서 c가 first, h가 second
        new_word = [] # 병합하는 과정, 단어 형태로 출력됨
        i = 0
        while i < len(word): 
            try:
                j = word.index(first, i)  # first의 index를 정해줌
                # 첫 번째 문자가 나오기 전까지의 문자를 새로운 단어 리스트에 추가
                new_word.extend(word[i:j]) 
                i = j
            except: 
                # 더 이상 찾을 수 없으면 나머지 문자를 모두 추가하고 종료
                new_word.extend(word[i:])
                break

            # 첫 번째 문자가 현재 위치에 있고, 다음 문자가 병합할 두 번째 문자라면 병합
            if word[i] == first and i <len(word)-1 and word[i+1] == second:
                new_word.append(first+second)
                i += 2   # 병합된 두 문자를 건너뜀
            else:
                new_word.append(word[i])
                i += 1
        
        # 새로운 단어 리스트를 튜플로 변환 (튜플을 사용하는 이유는 불변성 때문)
        new_word = tuple(new_word)
        word = new_word   # new_word를 word로 넣어줌
        if len(word) == 1:
            break
        else:
            # 병합할 새로운 문자 쌍을 다시 구함
            pairs = get_pairs(word)# 위에서 봤던 basic_clean을 거치고 소문자로 바꿔준 후 whitespace_clean을 통해 텍스트 정리
    
    # 최종 병합된 단어를 공백으로 구분된 문자열로 변환
    word = ' '.join(word)
    self.cache[token] = word
    return word

In [None]:
first, second = bigram 
new_word = [] # 병합하는 과정, 단어 형태로 출력됨
i = 0
while i < len(word):  # 단어의 길이가 0보다 클 때
    try:
        j = word.index(first, i)
        new_word.extend(word[i:j])
        i = j 
    except:
        

        new_word.extend(word[i:])
        break

    if word[i] == first and i <len(word)-1 and word[i+1] == second:
        new_word.append(first+second)
        i += 2
    else:
        new_word.append(word[i])
        i += 1
new_word = tuple(new_word)
word2 = new_word
print(len(word2), word2)

4 ('ch', 'a', 'i', 'r')


In [None]:
word2 = ' '.join(word)
word2

'c h a i r'

In [None]:
# byte_encoder

In [None]:
text = 'chair'
token = byte_encoder[b] for b in token.encode('utf-8')  
token

SyntaxError: invalid syntax (2059498874.py, line 2)

In [44]:
def encode(self, text):
    """
    BPE 토큰 -> 텍스트로 인코딩하는 함수
    BPE 방식으로 인코딩된 토큰을 원래의 텍스트로 복원
    """
        
    bpe_tokens = []        
    text = whitespace_clean(basic_clean(text)).lower() # 위에서 봤던 basic_clean을 거치고 소문자로 바꿔준 후 whitespace_clean을 통해 텍스트 정리
    for token in re.findall(self.pat, text): 
        token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))  # 각 토큰을 바이트 단위로 인코딩하고, 이를 유니코드 값으로 변환 (바이트 -> 유니코드 매핑 사용)
        bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))  # 매핑된 토큰을 
    return bpe_tokens

In [None]:
encode()

### CLIP.py

In [None]:
!pip install torchvision



In [None]:
_download("https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", " ")

NameError: name '_download' is not defined

In [None]:
_convert_image_to_rgb(image)

NameError: name '_convert_image_to_rgb' is not defined