# Week 2: Turning Words into Tokens

In this notebook, we will explore the process of converting text into tokens, a fundamental step in NLP tasks.

## 0. Setup

We will begin by importing the necessary libraries.

In [1]:
# Import necessary libraries
import re
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Import file checking whether TODO has been removed
#from helpers.check_todo import check_implementation

In [3]:
# Create the src directory if it doesn't exist
import os
os.makedirs('src', exist_ok=True)

## 1. Running Simple Tokenization

This section demonstrates a basic approach to tokenization using Python's built-in libraries and PyTorch. We will implement a basic tokenization function. This function will split the text into individual tokens.

In [4]:
sample_text = "Hello, how are you doing today?"

In [20]:
code_text = """
def calculate_llm_perplexity(model, text, max_length=1024):
    tokens = tokenizer.encode(text, max_length=max_length, truncation=True)
    input_ids = torch.tensor([tokens]).to(device)
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss
    return math.exp(loss.item())

# Example usage
perplexity = calculate_llm_perplexity(gpt2_model, "Hello, world!")
print(f"Perplexity: {perplexity:.2f}")
"""

In [6]:
def tokenize(text):
    # Hint: Use regex to split the text into words and punctuation
    split = map(str.strip, re.split("(\s+|,+|\.+|\!+|\?+|=+|\(+|\)+|\[+|\]+)", text))
    tokens = [s for s in split if s]
    return tokens

Our time to test whether you have reviewed 'TODO' in the first function we implement together. Remove 'TODO' once you're done implementing and no error message will appear.

In [7]:
try:
    # check_implementation(tokenize)
    pass
except NotImplementedError as e:
    print(e)

In [8]:
print("Tokenized text:", tokenize(sample_text))

Tokenized text: ['Hello', ',', 'how', 'are', 'you', 'doing', 'today', '?']


In [9]:
print("Tokenized code:", tokenize(code_text))

Tokenized code: ['def', 'calculate_llm_perplexity', '(', 'model', ',', 'text', ',', 'max_length', '=', '1024', ')', ':', 'tokens', '=', 'tokenizer', '.', 'encode', '(', 'text', ',', 'max_length', '=', 'max_length', ',', 'truncation', '=', 'True', ')', 'input_ids', '=', 'torch', '.', 'tensor', '(', '[', 'tokens', ']', ')', '.', 'to', '(', 'device', ')', 'with', 'torch', '.', 'no_grad', '(', ')', ':', 'outputs', '=', 'model', '(', 'input_ids', ',', 'labels', '=', 'input_ids', ')', 'loss', '=', 'outputs', '.', 'loss', 'return', 'math', '.', 'exp', '(', 'loss', '.', 'item', '(', '))', '#', 'Example', 'usage', 'perplexity', '=', 'calculate_llm_perplexity', '(', 'gpt2_model', ',', '"Hello', ',', 'world', '!', '"', ')', 'print', '(', 'f"Perplexity:', '{perplexity:', '.', '2f}"', ')']


## 2. Creating a Vocabulary

In this section we will create a function that takes a list of texts as input and returns a dictionary. In it each key is a unique word (or token) from the texts and its corresponding value is a unique index. The function should also reserve a special token <UNK> with index 0 to represent unknown words that may appear in future texts.

In [10]:
def build_vocabulary(texts):

    # Do not forget to reserve a slot for unknown tokens
    tokens = [token for text in texts for token in tokenize(text)]
    unique_tokens = set(tokens)
    vocab = {token: id for id, token in enumerate(unique_tokens)}
    vocab["[UNK]"] = len(vocab)

    return vocab

In [11]:
try:
    # check_implementation(build_vocabulary)
    pass
except NotImplementedError as e:
    print(e)

In [12]:
# TODO: Use your examples for a sample dataset
# We won't be checking whether you have removed TODO here
# But using your own sentences is encouraged!

sample_dataset = [
    "42 is the Ultimate answer for Life, the Universe, and Everything.",
    "Hello, world of LLM Trailblazers! This is another example.",
    "What is the weather like today in Munich?"
]

In [13]:
vocab = build_vocabulary(sample_dataset)
print("Vocabulary:", vocab)

Vocabulary: {'Universe': 0, 'weather': 1, 'world': 2, 'for': 3, 'What': 4, 'This': 5, 'like': 6, 'the': 7, 'of': 8, '?': 9, 'is': 10, 'Life': 11, '42': 12, '!': 13, 'Ultimate': 14, 'another': 15, 'answer': 16, 'example': 17, 'today': 18, ',': 19, 'Everything': 20, 'and': 21, 'LLM': 22, '.': 23, 'in': 24, 'Trailblazers': 25, 'Hello': 26, 'Munich': 27, '[UNK]': 28}


## 3. Implementing a Custom Dataloader

We have a lot of text data, but it's all different lengths. We need to make it work for our model. To do this, we'll create two special helpers:

1. A `Dataset` class: This will help us prepare our text data for our model. We'll break down the text into smaller pieces and convert it into a format our model can understand.
2. A `DataLoader` class: This will help us feed our prepared data to our model in batches. We'll sort the batches by length, add padding to make them all the same size, and create a mask to ignore the extra padding.

By using these two helpers, we'll be able to get our data in order and make it easy for our model to work with. This will make our training process smoother and more efficient.

In [14]:
class TextDataset(Dataset):
    def __init__(self, texts, vocab):
        """
        Initialize the dataset with texts and vocabulary.

        :param texts: A list of text samples.
        :param vocab: A dictionary representing the vocabulary, where keys are tokens and values are their corresponding IDs.
        """
        self.texts = texts
        self.vocab = vocab
        self.unk_id = self.vocab["[UNK]"]
        assert len(self.vocab) == (self.unk_id + 1)
        self.stride=1
        self.sample_len=4
        self.__prepare__()

    def __prepare__(self):
        for text in self.texts:
          token_ids = [self.vocab.get(token, self.unk_id) for token in tokenize(text)]

    def __len__(self):

        return len(self.texts)

    def __getitem__(self, idx):

        tokens = [self.vocab.get(token, self.unk_id) for token in tokenize(self.texts[idx])]
        return tokens[:-1], tokens[-1]

In [None]:
try:
    #check_implementation(TextDataset)
    pass
except NotImplementedError as e:
    print(e)

In [None]:
# Create a dataset instance
dataset = TextDataset(sample_dataset, vocab)

In [None]:
batch_size = 2
simple_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Display a batch of data
for batch in simple_dataloader:
    print("Batch shape:", batch.shape)
    print("Sample batch:", batch)
    break

RuntimeError: each element in list of batch should be of equal size

In [None]:
print("Attempting to iterate through the dataloader:")
try:
    for batch in simple_dataloader:
        print("Processed batch:", batch)
        break
except RuntimeError as e:
    print(f"Caught an error: {e}")
    print("\nThis error occurs because we're trying to batch sequences of different lengths.")

Attempting to iterate through the dataloader:
Processed batch: tensor([28, 28])


Now, let's implement a custom collate_fn to handle variable-length sequences.

In [None]:
def collate_fn(batch):
    # Separate the input sequences and targets
    sequences, targets = zip(*batch)

    # Pad the sequences
    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0)

    # Pad the targets if they are sequences, otherwise just stack them
    if isinstance(targets[0], torch.Tensor) and targets[0].dim() > 0:
        padded_targets = pad_sequence(targets, batch_first=True, padding_value=0)
    else:
        padded_targets = torch.stack(targets)

    return padded_sequences, padded_targets

In [None]:
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
print("Iterating through the dataloader with custom collate_fn:")
for batch, mask in dataloader:
    print("Processed batch shape:", batch.shape)
    print("Mask shape:", mask.shape)
    print("Sample batch:")
    print(batch)
    print("Sample mask:")
    print(mask)
    break

# TODO: Experiment with setting DataLoader with shuffle=False

Iterating through the dataloader with custom collate_fn:


TypeError: 'int' object is not iterable

The TextProcessor now successfully handles variable-length sequences!

## 4. Putting It All Together

Time to combine tokenization, vocabulary creation and data preparation in batches. That's where our `TextProcessor` will help.

In [None]:
class TextProcessor:
    def __init__(self):
        self.vocab = None

    def tokenize(self, text):

        # TODO: Implement tokenization

        pass

    def build_vocab(self, texts):

        # TODO: Build vocabulary from a list of texts

        pass

    def create_dataloader(self, texts, batch_size):

        # TODO: Create a DataLoader with TextDataset from a list of texts

        pass

In [None]:
try:
    check_implementation(TextProcessor)
except NotImplementedError as e:
    print(e)

In [None]:
# Test the TextProcessor
processor = TextProcessor()
processor.build_vocab(sample_dataset)
dataloader = processor.create_dataloader(sample_dataset, batch_size=2)

In [None]:
for batch in dataloader:
    print("Processed batch:", batch)
    break

#### Congratulations! You've implemented a basic text processing pipeline. This will be useful for handling input data in your LLM projects.

## Extra: Reviewing Tokenization Libraries

We'll use `tiktoken`at a later stage for tokenization, so let's see what it does and compare it to another simple tokenization library `NLTK`.

### Using NLTK

In [17]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [18]:
nltk_tokens = word_tokenize(sample_text)
print("NLTK Tokens:", nltk_tokens)

NLTK Tokens: ['Hello', ',', 'how', 'are', 'you', 'doing', 'today', '?']


In [21]:
nltk_code_tokens = word_tokenize(code_text)
print("NLTK Tokens for Code:")

NLTK Tokens for Code:


In [23]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [24]:
import tiktoken

### Using Tiktoken

In [25]:
enc = tiktoken.get_encoding("cl100k_base")
tiktoken_tokens = enc.encode(sample_text)
print("Tiktoken Tokens:", tiktoken_tokens)
print("Decoded Tiktoken Tokens:", enc.decode(tiktoken_tokens))

Tiktoken Tokens: [9906, 11, 1268, 527, 499, 3815, 3432, 30]
Decoded Tiktoken Tokens: Hello, how are you doing today?


In [26]:
print(f"NLTK token count: {len(nltk_tokens)}")
print(f"Tiktoken token count: {len(tiktoken_tokens)}")

NLTK token count: 8
Tiktoken token count: 8


In [27]:
tiktoken_code_tokens = enc.encode(code_text)
print("\nTiktoken Tokens (decoded for readability):")
print(enc.decode_tokens_bytes(tiktoken_code_tokens))
print(f"Tiktoken token count: {len(tiktoken_code_tokens)}")


Tiktoken Tokens (decoded for readability):
[b'\n', b'def', b' calculate', b'_ll', b'm', b'_per', b'plex', b'ity', b'(model', b',', b' text', b',', b' max', b'_length', b'=', b'102', b'4', b'):\n', b'   ', b' tokens', b' =', b' tokenizer', b'.encode', b'(text', b',', b' max', b'_length', b'=max', b'_length', b',', b' trunc', b'ation', b'=True', b')\n', b'   ', b' input', b'_ids', b' =', b' torch', b'.tensor', b'([', b'tokens', b']).', b'to', b'(device', b')\n', b'   ', b' with', b' torch', b'.no', b'_grad', b'():\n', b'       ', b' outputs', b' =', b' model', b'(input', b'_ids', b',', b' labels', b'=input', b'_ids', b')\n', b'   ', b' loss', b' =', b' outputs', b'.loss', b'\n', b'   ', b' return', b' math', b'.exp', b'(loss', b'.item', b'())\n\n', b'#', b' Example', b' usage', b'\n', b'per', b'plex', b'ity', b' =', b' calculate', b'_ll', b'm', b'_per', b'plex', b'ity', b'(g', b'pt', b'2', b'_model', b',', b' "', b'Hello', b',', b' world', b'!")\n', b'print', b'(f', b'"', b'Per', b'plex