In [1]:
from importlib.metadata import version

import tiktoken
import torch

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.4.1+cu121
tiktoken version: 0.8.0


## Tokenizing text

- [Edith Wharton《The Verdict》](https://en.wikisource.org/wiki/The_Verdict)

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [3]:
import re

# Split on every whitespace character. Because the pattern is wrapped in a
# capturing group, each whitespace separator is kept in the resulting list.
text = "Hello, world. This, is a test."
whitespace_pattern = re.compile(r'(\s)')
result = whitespace_pattern.split(text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [4]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'\s', text) # without the capturing (), the matched separators are not kept

print(result)

['Hello,', 'world.', 'This,', 'is', 'a', 'test.']


In [5]:
# split punctuation characters
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [6]:
# remove whitespace
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [7]:
# Compare the basic pattern (commas, periods, whitespace) with the extended
# one that also splits on question marks, quotation marks, and double-dashes.
text = "Hello, world. Is this-- a test?"

for pattern in (r'([,.]|\s)', r'([,.?_!"()\']|--|\s)'):
    # Strip every piece first, then drop the ones that end up empty.
    pieces = (piece.strip() for piece in re.split(pattern, text))
    result = [piece for piece in pieces if piece]
    print(result)

['Hello', ',', 'world', '.', 'Is', 'this--', 'a', 'test?']
['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [8]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [9]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print("raw text length: ", len(raw_text))
print("the number of tokens: ", len(preprocessed))
print(preprocessed[:30])

raw text length:  20479
the number of tokens:  4649
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


## Converting tokens into token IDs

In [10]:
all_words = sorted(list(set(preprocessed)))
vocab_size = len(all_words)

print(vocab_size)

1159


In [11]:
vocab = {token:integer for integer,token in enumerate(all_words)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Carlo;': 25,
 'Chicago': 26,
 'Claude': 27,
 'Come': 28,
 'Croft': 29,
 'Destroyed': 30,
 'Devonshire': 31,
 'Don': 32,
 'Dubarry': 33,
 'Emperors': 34,
 'Florence': 35,
 'For': 36,
 'Gallery': 37,
 'Gideon': 38,
 'Gisburn': 39,
 'Gisburns': 40,
 'Grafton': 41,
 'Greek': 42,
 'Grindle': 43,
 'Grindle:': 44,
 'Grindles': 45,
 'HAD': 46,
 'Had': 47,
 'Hang': 48,
 'Has': 49,
 'He': 50,
 'Her': 51,
 'Hermia': 52,
 'His': 53,
 'How': 54,
 'I': 55,
 'If': 56,
 'In': 57,
 'It': 58,
 'Jack': 59,
 'Jove': 60,
 'Just': 61,
 'Lord': 62,
 'Made': 63,
 'Miss': 64,
 'Money': 65,
 'Monte': 66,
 'Moon-dancers': 67,
 'Mr': 68,
 'Mrs': 69,
 'My': 70,
 'Never': 71,
 'No': 72,
 'Now': 73,
 'Nutley': 74,
 'Of': 75,
 'Oh': 76,
 'On': 

In [12]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Carlo;', 25)
('Chicago', 26)
('Claude', 27)
('Come', 28)
('Croft', 29)
('Destroyed', 30)
('Devonshire', 31)
('Don', 32)
('Dubarry', 33)
('Emperors', 34)
('Florence', 35)
('For', 36)
('Gallery', 37)
('Gideon', 38)
('Gisburn', 39)
('Gisburns', 40)
('Grafton', 41)
('Greek', 42)
('Grindle', 43)
('Grindle:', 44)
('Grindles', 45)
('HAD', 46)
('Had', 47)
('Hang', 48)
('Has', 49)
('He', 50)


In [13]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Attributes
    ----------
    str_to_int : dict
        The vocabulary passed to the constructor (token string -> id).
    int_to_str : dict
        Inverse of ``str_to_int`` (id -> token string), used for decoding.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Invert the vocabulary once so decode() is a plain dict lookup.
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their ids.

        The text is split on punctuation, double-dashes and whitespace;
        empty and whitespace-only pieces are discarded. Raises ``KeyError``
        for a token that is not in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Map ``ids`` back to tokens and join them into a single string."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the whitespace that the join put in front of punctuation.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [14]:
# class SimpleTokenizerV1:
#     """
#     A simple tokenizer for encoding and decoding text based on a given vocabulary.

#     Attributes:
#     -----------
#     token_to_id : dict
#         A dictionary that maps tokens (words or symbols) to unique IDs.
#     id_to_token : dict
#         A dictionary that maps unique IDs back to tokens.

#     Methods:
#     --------
#     encode(text: str) -> list[int]
#         Encodes a given text into a list of token IDs based on the vocabulary.
#     decode(ids: list[int]) -> str
#         Decodes a list of token IDs back into a text string.
#     """
#     def __init__(self, vocab):
#         """
#         Initializes the tokenizer with a given vocabulary.

#         Parameters:
#         -----------
#         vocab : dict
#             A dictionary mapping tokens (words or symbols) to unique IDs.
#         """
#         # 使用更具描述性的變數名稱，使代碼更加清晰
#         self.token_to_id = vocab
#         self.id_to_token = {idx: token for token, idx in vocab.items()}

#     def encode(self, text):
#         """
#         Encodes a given text into a list of token IDs.

#         The text is split into tokens based on punctuation, whitespace, and special characters.
#         Each token is then mapped to its corresponding ID from the vocabulary.

#         Parameters:
#         -----------
#         text : str
#             The input text to be encoded.

#         Returns:
#         --------
#         list[int]
#             A list of token IDs representing the encoded text.
#         """
#         # 使用正則表達式的括號捕獲標點符號和空白字符
#         tokens = re.split(r'([,.?_!"()\'\']|--|\s)', text)
#         # 過濾掉空字串並移除多餘的空格
#         tokens = [token for token in tokens if token.strip()]
#         # 用 `get()` 來處理可能不在詞彙中的情況，避免程式錯誤
#         try:
#             ids = [self.token_to_id[token] for token in tokens]
#         except KeyError:
#             return None
#         return ids

#     def decode(self, ids):
#         """
#         Decodes a list of token IDs back into a text string.

#         Each ID is mapped back to its corresponding token, and the tokens are joined to form the output text.
#         Punctuation spacing is corrected to ensure natural text formatting.

#         Parameters:
#         -----------
#         ids : list[int]
#             A list of token IDs to be decoded.

#         Returns:
#         --------
#         str
#             The decoded text string.
#         """
#         # 使用生成式來處理編碼到文本的映射
#         tokens = (self.id_to_token.get(idx, "<UNK>") for idx in ids)
#         # 將 tokens 串接起來
#         text = " ".join(tokens)
#         # 移除標點符號前面的空格，使輸出的文本更加自然
#         text = re.sub(r'\s+([,.?"()\'\'])', r'\1', text)
#         return text

In [15]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 1, 69, 7, 39, 873, 1136, 773, 812, 7]


In [16]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [17]:
tokenizer.decode(tokenizer.encode(text))

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

## Adding special context tokens

- `[BOS]` (beginning of sequence) : 表示文章的開始
- `[EOS]` (end of sequence) :  表示文章結束的位置
- `[PAD]` (padding) : 如果訓練 LLM 的批次大於1，由於長度不一，可以使用 [PAD] 作為填充字詞來確保長度一致
- `[UNK]` : 表示未在詞彙表的字詞

- GPT-2 不使用上述任何標記，而只使用 `<|endoftext|>`来降低複雜性，而 `<|endoftext|>` 類似於上文提到的 `[EOS]`
- GPT 也使用 `<|endoftext|>` 作為填充字詞
- GPT-2 不使用 `<UNK>` 字詞來處理詞彙表以外的單字；相反，GPT-2 使用位元組對編碼（Byte pair encoding, BPE）分詞器，它將字詞分解成子詞單元

In [18]:
tokenizer = SimpleTokenizerV1(vocab)

text = "Hello, do you like tea. Is this-- a test?"

tokenizer.encode(text)

KeyError: 'Hello'

In [19]:
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [20]:
len(vocab.items())

1161

In [21]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|endoftext|>', 1159)
('<|unk|>', 1160)


In [22]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Attributes
    ----------
    str_to_int : dict
        The vocabulary passed to the constructor (token string -> id).
    int_to_str : dict
        Inverse of ``str_to_int`` (id -> token string), used for decoding.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Invert the vocabulary once so decode() is a plain dict lookup.
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their ids.

        The text is split on punctuation, double-dashes and whitespace;
        empty and whitespace-only pieces are discarded. A token missing from
        the vocabulary is replaced by ``"<|unk|>"`` before the id lookup.
        """
        ids = []
        for piece in re.split(r'([,.?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Map ``ids`` back to tokens and join them into a single string."""
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Remove the whitespace that the join put in front of punctuation.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [23]:
# import re

# class SimpleTokenizerV2:
#     """
#     A simple tokenizer for encoding and decoding text based on a given vocabulary.

#     Attributes:
#     -----------
#     token_to_id : dict
#         A dictionary that maps tokens (words or symbols) to unique IDs.
#     id_to_token : dict
#         A dictionary that maps unique IDs back to tokens.

#     Methods:
#     --------
#     encode(text: str) -> list[int]
#         Encodes a given text into a list of token IDs based on the vocabulary.
#     decode(ids: list[int]) -> str
#         Decodes a list of token IDs back into a text string.
#     """
#     def __init__(self, vocab):
#         """
#         Initializes the tokenizer with a given vocabulary.

#         Parameters:
#         -----------
#         vocab : dict
#             A dictionary mapping tokens (words or symbols) to unique IDs.
#         """
#         # 使用更具描述性的變數名稱，使代碼更加清晰。
#         self.token_to_id = vocab
#         self.id_to_token = {idx: token for token, idx in vocab.items()}

#     def encode(self, text):
#         """
#         Encodes a given text into a list of token IDs.

#         The text is split into tokens based on punctuation, whitespace, and special characters.
#         Each token is then mapped to its corresponding ID from the vocabulary.
#         If a token is not in the vocabulary, it is replaced by the special token "<|unk|>".

#         Parameters:
#         -----------
#         text : str
#             The input text to be encoded.

#         Returns:
#         --------
#         list[int]
#             A list of token IDs representing the encoded text.
#         """
#         # 使用正則表達式的括號捕獲標點符號和空白字符
#         tokens = re.split(r'([,.?_!"()'']|--|\s)', text)
#         # 過濾掉空字串並移除多餘的空格
#         tokens = [token for token in tokens if token.strip()]
#         # 使用 "<|unk|>" 來替代不在詞彙中的 token
#         tokens = [token if token in self.token_to_id else "<|unk|>" for token in tokens]
#         ids = [self.token_to_id.get(token, self.token_to_id.get("<|unk|>")) for token in tokens]
#         return ids

#     def decode(self, ids):
#         """
#         Decodes a list of token IDs back into a text string.

#         Each ID is mapped back to its corresponding token, and the tokens are joined to form the output text.
#         Punctuation spacing is corrected to ensure natural text formatting.

#         Parameters:
#         -----------
#         ids : list[int]
#             A list of token IDs to be decoded.

#         Returns:
#         --------
#         str
#             The decoded text string.
#         """
#         # 使用生成式來處理編碼到文本的映射。
#         tokens = (self.id_to_token.get(idx, "<UNK>") for idx in ids)
#         # 將 tokens 串接起來
#         text = " ".join(tokens)
#         # 移除標點符號前面的空格，使輸出的文本更加自然
#         text = re.sub(r'\s+([,.?"()\'\'])', r'\1', text)
#         return text

In [24]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [25]:
tokenizer.encode(text)

[1160,
 5,
 362,
 1155,
 642,
 1000,
 10,
 1159,
 57,
 1013,
 981,
 1009,
 738,
 1013,
 1160,
 7]

In [26]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

## Byte pair encoding

- GPT-2 使用位元組對編碼（BPE）作為分詞器
- 它允許模型將不在其預定義詞彙表中的單字分解為更小的子單字單元甚至單個字符，從而使其能夠處理詞彙表之外的單詞
- e.g. 如果GPT-2 的詞彙表中沒有 "unfamiliarword" 這個詞，它可能會根據訓練有素的 BPE 合併結果，將其標記為["unfam"、"iliar"、"word"] 或其他子字
- 原始BPE 分詞器可在此處找到：[https://github.com/openai/gpt-2/blob/master/src/encoder.py](https://github.com/openai/gpt-2/blob/master/src/encoder.py)
- 在本章中，我們使用 OpenAI 的開源 [tiktoken](https://github.com/openai/tiktoken) 庫中的 BPE 標記符號生成器，該庫用 Rust 實現了其核心演算法，以提高計算性能

In [27]:
# pip install tiktoken

In [28]:
# `import importlib` alone does not guarantee that the `importlib.metadata`
# submodule has been loaded, so import the submodule explicitly. This keeps
# the cell working when it is run on its own in a fresh kernel.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.8.0


In [29]:
tokenizer = tiktoken.get_encoding("gpt2")

In [30]:
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace."

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [31]:
strings = tokenizer.decode(integers)

print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


In [32]:
integers = tokenizer.encode("Akwirw ier")
print(integers)

[33901, 86, 343, 86, 220, 959]


In [33]:
for i in integers:
    print(f"{i} -> {tokenizer.decode([i])}")

33901 -> Ak
86 -> w
343 -> ir
86 -> w
220 ->  
959 -> ier


In [34]:
strings = tokenizer.decode(integers)
print(strings)

Akwirw ier


### Compare BPE
- https://github.com/datawhalechina/llms-from-scratch-cn/tree/main/Codes/ch02/02_bonus_bytepair-encoder

In [35]:
import tiktoken
# Create an encoder object that uses the GPT-2 encoding
tik_tokenizer = tiktoken.get_encoding("gpt2")
# Define the sample text; it is encoded with tik_tokenizer in the next cell
text = "Hello, world. Is this-- a test?"

In [36]:
# The allowed_special parameter specifies which special tokens are allowed to appear in the encoded result
integers = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]


In [37]:
# Decode the token IDs back into text
strings = tik_tokenizer.decode(integers)

print(strings)

Hello, world. Is this-- a test?


In [38]:
# Vocabulary size of the encoder
print(tik_tokenizer.n_vocab)

50257


In [39]:
"""
Byte pair encoding utilities

Code from https://github.com/openai/gpt-2/blob/master/src/encoder.py

And modified code (download_vocab) from
https://github.com/openai/gpt-2/blob/master/download_model.py

Modified MIT License

Software Copyright (c) 2019 OpenAI

We don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
associated documentation files (the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
The above copyright notice and this permission notice need not be included
with content created by the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
OR OTHER DEALINGS IN THE SOFTWARE.


"""

import os
import json
import regex as re
import requests
from tqdm import tqdm
from functools import lru_cache

# Build the byte -> unicode-character lookup table used by the BPE encoder
@lru_cache()
def bytes_to_unicode():
    """
    Return a dict mapping each of the 256 byte values to a one-character string.

    The reversible BPE codes work on unicode strings, so every byte needs a
    character to stand in for it. Bytes in the ranges "!".."~", "¡".."¬" and
    "®".."ÿ" map to the character with the same code point. Every remaining
    byte is assigned, in ascending byte order, the next code point starting at
    256, which avoids mapping to the whitespace/control characters the BPE
    code cannot handle.

    The result is cached, so repeated calls return the same dict object.
    """
    same_code_point = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    # Bytes outside the ranges above, in ascending order.
    shifted_bytes = [b for b in range(2**8) if b not in same_code_point]

    byte_values = same_code_point + shifted_bytes
    code_points = same_code_point + [2**8 + offset for offset in range(len(shifted_bytes))]
    return {b: chr(cp) for b, cp in zip(byte_values, code_points)}

# Collect the adjacent symbol pairs of a word
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Word is represented as a tuple of symbols (symbols being variable-length
    strings); any sliceable sequence works.

    A word with fewer than two symbols has no pairs, so an empty set is
    returned. This includes the empty word, which previously raised
    ``IndexError`` on ``word[0]``.
    """
    # Pair every symbol with its successor; zip stops at the shorter input,
    # so empty and single-symbol words naturally yield no pairs.
    return set(zip(word, word[1:]))

# Encoder class that encodes and decodes text with byte pair encoding (BPE)
class Encoder:
    """Byte-level BPE encoder/decoder.

    Parameters
    ----------
    encoder : dict
        Maps BPE token strings to token ids; inverted to build ``decoder``.
    bpe_merges : sequence of tuple
        Merge pairs in priority order. A pair's position in the sequence is
        its rank, and the lowest-ranked pair present in a word is merged first.
    errors : str, default 'replace'
        Error-handling mode passed to ``bytearray.decode`` in :meth:`decode`.
    """

    def __init__(self, encoder, bpe_merges, errors='replace'):
        self.encoder = encoder
        self.decoder = {v:k for k,v in self.encoder.items()}
        self.errors = errors # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        # Memoizes bpe() results per token string.
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        # NOTE: \p{L} and \p{N} require the third-party `regex` module, which this cell imports as `re`.
        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

    def bpe(self, token):
        """Apply the BPE merges to ``token`` and return the symbols joined by spaces.

        A token with no symbol pairs (a single character) is returned as is.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the pair with the lowest merge rank; unknown pairs rank as +inf.
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                # No remaining pair is a known merge.
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except ValueError:
                    # tuple.index found no further `first`: copy the tail and stop.
                    # (Was a bare `except:`, which also swallowed KeyboardInterrupt
                    # and any genuine bug.)
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    # Merge the adjacent (first, second) into one symbol.
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Encode ``text`` into a list of BPE token ids."""
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            # Represent the token's UTF-8 bytes as their stand-in unicode characters.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Decode a sequence of token ids back into text."""
        text = ''.join([self.decoder[token] for token in tokens])
        # Map each stand-in character back to its byte, then decode as UTF-8.
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
        return text
# Load the encoder for a given model from disk
def get_encoder(model_name, models_dir):
    """Build an ``Encoder`` from the files in ``models_dir/model_name``.

    Reads ``encoder.json`` (the token -> id table) and ``vocab.bpe`` (the
    merge list) and returns an ``Encoder`` with the default error handling.
    """
    model_dir = os.path.join(models_dir, model_name)
    # Read both files as UTF-8 explicitly; encoder.json previously used the
    # platform default encoding, which is not UTF-8 on every system.
    with open(os.path.join(model_dir, 'encoder.json'), 'r', encoding="utf-8") as f:
        encoder = json.load(f)
    with open(os.path.join(model_dir, 'vocab.bpe'), 'r', encoding="utf-8") as f:
        bpe_data = f.read()
    # [1:-1] drops the first line and the last split element (presumably a
    # header line and the empty string after the trailing newline; confirm
    # against the file).
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )

# Download the GPT-2 vocabulary files
def download_vocab():
    """Download ``encoder.json`` and ``vocab.bpe`` (GPT-2 117M) into ``gpt2_model/``.

    Modified from https://github.com/openai/gpt-2/blob/master/download_model.py

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. The check runs before the
        target file is opened, so a failed request does not leave a bogus file.
    """
    subdir = 'gpt2_model'
    os.makedirs(subdir, exist_ok=True)
    subdir = subdir.replace('\\','/') # needed for Windows

    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models/117M"
    # 1k for chunk_size, since Ethernet packet size is around 1500 bytes
    chunk_size = 1000

    for filename in ['encoder.json', 'vocab.bpe']:
        # Use the response as a context manager so the connection is released.
        with requests.get(base_url + "/" + filename, stream=True) as r:
            r.raise_for_status()

            # content-length may be absent; tqdm accepts total=None in that case.
            content_length = r.headers.get("content-length")
            file_size = int(content_length) if content_length is not None else None

            with open(os.path.join(subdir, filename), 'wb') as f:
                with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
                        # Advance by the bytes actually received; the last
                        # chunk is usually shorter than chunk_size.
                        pbar.update(len(chunk))


ModuleNotFoundError: No module named 'tqdm'

## Data sampling with a sliding window

In [40]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [41]:
tokenizer

<Encoding 'gpt2'>

In [42]:
enc_text

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [43]:
enc_sample = enc_text[50:]

In [44]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [45]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [46]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [47]:
import torch
print("PyTorch version:", torch.__version__)

PyTorch version: 2.4.1+cu121


In [51]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then
    taken every ``stride`` tokens; the target of each window is the same
    window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer

        # Tokenize the whole text, allowing the <|endoftext|> special token.
        token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})

        # Start offsets of the windows. The upper bound leaves room for the
        # one extra token that the shifted target window needs.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [49]:
# Build a DataLoader of sliding-window (input, target) batches from raw text
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Tokenize ``txt`` with the GPT-2 BPE encoding and wrap it in a DataLoader.

    Parameters
    ----------
    txt : str
        Raw text to tokenize.
    batch_size : int
        Number of windows per batch.
    max_length : int
        Number of tokens in each input/target window.
    stride : int
        Step between the starts of consecutive windows.
    shuffle : bool
        Whether the DataLoader shuffles the windows.
    drop_last : bool
        Whether an incomplete final batch is dropped.
    num_workers : int
        Number of DataLoader worker processes. The default of 0 loads in the
        main process, which is what the previous signature always did.

    Returns
    -------
    torch.utils.data.DataLoader
        Yields ``(inputs, targets)`` batches.
    """
    # Initialize the tokenizer (tiktoken returns an Encoding object)
    tokenizer = tiktoken.get_encoding("gpt2")

    # Build the dataset (a torch Dataset subclass)
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Build the data loader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    return dataloader

In [None]:
# # pytorch CPU
# def create_dataloader_v1(txt, batch_size=4, max_length=256, 
#                          stride=128, shuffle=True, drop_last=True,
#                          num_workers=0):

#     # Initialize the tokenizer
#     tokenizer = tiktoken.get_encoding("gpt2")

#     # Create dataset
#     dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

#     # Create dataloader
#     dataloader = DataLoader(
#         dataset,
#         batch_size=batch_size,
#         shuffle=shuffle,
#         drop_last=drop_last,
#         num_workers=num_workers # 多核CPU
#     )

#     return dataloader

In [50]:
# from torch.utils.data import Dataset, DataLoader
# from tiktoken.core import Encoding
# from typing import List, Tuple

# class GPTDatasetV1(Dataset):
#     """
#     A custom dataset class for preparing text data for training GPT-like models.

#     Attributes:
#     -----------
#     input_ids : list[torch.Tensor]
#         List of input tensors for training.
#     target_ids : list[torch.Tensor]
#         List of target tensors for training.

#     Methods:
#     --------
#     __len__() -> int
#         Returns the number of samples in the dataset.
#     __getitem__(idx: int) -> tuple[torch.Tensor, torch.Tensor]
#         Returns the input and target tensors for a given index.
#     """
#     def __init__(self, txt: str, tokenizer: Encoding, max_length: int, stride: int):
#         """
#         Initializes the dataset by splitting the text into overlapping sequences.

#         Parameters:
#         -----------
#         txt : str
#             The input text to be tokenized and divided into training sequences.
#         tokenizer : Encoding
#             A tokenizer object from the `tiktoken` library that provides an `encode` method to convert text into a list of token IDs.
#         max_length : int
#             The maximum length of each training sequence.
#         stride : int
#             The step size for the sliding window to create overlapping sequences.
#         """
#         self.tokenizer = tokenizer
#         self.input_ids: List[torch.Tensor] = []
#         self.target_ids: List[torch.Tensor] = []

#         # 對全部文本進行分詞，並確保特殊符號被允許
#         token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})

#         # 使用滑動窗口將文本分割為最大長度的重疊序列
#         for i in range(0, len(token_ids) - max_length, stride):
#             input_chunk = token_ids[i:i + max_length]
#             target_chunk = token_ids[i + 1: i + max_length + 1]
#             self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
#             self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

#     def __len__(self) -> int:
#         """
#         Returns the number of samples in the dataset.
#         """
#         return len(self.input_ids)

#     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
#         """
#         Returns the input and target tensors for a given index.

#         Parameters:
#         -----------
#         idx : int
#             The index of the data sample to be retrieved.

#         Returns:
#         --------
#         tuple[torch.Tensor, torch.Tensor]
#             The input and target tensors for the specified index.
#         """
#         return self.input_ids[idx], self.target_ids[idx]

- Let's test the dataloader with a batch size of 1 for an LLM with a context size of 4:

In [52]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [53]:
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [54]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [55]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Creating token embeddings

- 使用嵌入層將這些詞元嵌入到連續的向量表示中
- 通常，這些嵌入層是大型語言模型本身的一部分，在模型訓練期間會進行更新（訓練）

In [56]:
input_ids = torch.tensor([5, 1, 3, 2])

In [57]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
embedding_layer

Embedding(6, 3)

In [59]:
print(embedding_layer.weight)
print(embedding_layer.weight.shape)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)
torch.Size([6, 3])


- 對於熟悉 one-hot encoding 的人來說，上述的嵌入層方法本質上只是一種更有效率的實現方式，它相當於在全連接層中先進行 one-hot encoding，然後進行矩陣乘法
- 因為嵌入層只是一種更有效率的實現方式，它等同於獨熱編碼和矩陣乘法的方法，所以它可以被視為一個可以透過反向傳播進行最佳化的神經網路層

### embedding-vs-matmul

In [61]:
# Use a token ID to look up its row (the token vector) in the embedding layer
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [63]:
input_ids

tensor([5, 1, 3, 2])

In [62]:
print(embedding_layer(input_ids))

tensor([[-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010],
        [-0.4015,  0.9666, -1.1481],
        [ 1.2753, -0.2010, -0.1606]], grad_fn=<EmbeddingBackward0>)


## Encoding word positions

In [69]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [70]:
print(token_embedding_layer.weight)
print(token_embedding_layer.weight.shape)

Parameter containing:
tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        ...,
        [-0.1769,  0.1951,  0.2734,  ..., -0.6994, -0.8924,  0.7343],
        [-0.0437,  2.3365, -0.8838,  ...,  0.0348, -0.1641, -1.6272],
        [-0.1025, -0.5542, -1.7585,  ...,  0.7799, -1.9565, -0.2441]],
       requires_grad=True)
torch.Size([50257, 256])


In [71]:
max_length = 4
# NOTE(review): stride=5 is larger than max_length=4, so one token is skipped
# between consecutive windows. In the batch printed below, the first row ends
# at 1464 and the second starts at 3619, dropping token 1807. Confirm the gap
# is intended rather than stride=max_length.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=5, shuffle=False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [72]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 3619,   402,   271, 10899],
        [  257,  7026, 15632,   438],
        [  257,   922,  5891,  1576],
        [  568,   340,   373,   645],
        [ 5975,   284,   502,   284],
        [  326,    11,   287,   262],
        [  286,   465, 13476,    11]])

Inputs shape:
 torch.Size([8, 4])


In [73]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


 - GPT-2 uses `absolute position embeddings`, so we just create another embedding layer:

In [74]:
block_size = max_length
pos_embedding_layer = torch.nn.Embedding(block_size, output_dim)

In [77]:
print(pos_embedding_layer.weight)
print(pos_embedding_layer.weight.shape)

Parameter containing:
tensor([[-1.4693,  1.0024,  0.6403,  ..., -0.7098, -0.4741,  1.3287],
        [-0.3833,  0.5006,  2.1007,  ..., -0.1256,  0.8334, -1.8840],
        [ 0.3221,  0.9576, -1.5949,  ...,  0.4771, -0.7206,  0.2753],
        [ 0.1482, -1.1207,  1.1867,  ...,  0.5207, -1.0125, -0.3823]],
       requires_grad=True)
torch.Size([4, 256])


In [78]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


 - To create the input embeddings used in an LLM, we simply add the token and the positional embeddings:

In [79]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
