# Byte Pair Tokenizer 
*just getting a hang of it!*

In [2]:
import requests
import re
from collections import defaultdict, Counter
import torch

### Functions

In [3]:
# Step 1: Download a small corpus
def download_corpus(url):
    response = requests.get(url)
    return response.text
# Step 2: Implement the BPE algorithm
def get_vocab(corpus):
    vocab = defaultdict(int)
    for word in corpus.split():
        word = ' '.join(list(word)) + ' </w>'
        vocab[word] += 1
    return vocab

def get_stats(vocab):
    pairs = defaultdict(int)
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[symbols[i], symbols[i + 1]] += freq
    return pairs

def merge_vocab(pair, vocab):
    new_vocab = {}
    bigram = re.escape(' '.join(pair))
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    for word in vocab:
        w_out = p.sub(''.join(pair), word)
        new_vocab[w_out] = vocab[word]
    return new_vocab

def bpe(corpus, num_merges):
    vocab = get_vocab(corpus)
    for i in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

# Step 3: Tokenize the corpus using the BPE tokenizer
def tokenize(corpus, vocab):
    tokens = []
    for word in corpus.split():
        word = ' '.join(list(word)) + ' </w>'
        for pair in vocab:
            if pair in word:
                word = word.replace(pair, ''.join(pair.split()))
        tokens.extend(word.split())
    return tokens

# Step 4: Calculate and display the total tokens in the vocabulary and the compression ratio
def calculate_compression_ratio(original_corpus, tokenized_corpus):
    original_length = len(original_corpus.split())
    tokenized_length = len(tokenized_corpus)
    return original_length / tokenized_length

### Execute

In [4]:
# Download a small corpus
# url = "https://www.gutenberg.org/files/11/11-0.txt"  # Alice's Adventures in Wonderland
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
corpus = download_corpus(url)
print(f'Corpus length: {len(corpus)} characters')
print('\nFirst 100 characters:\n', corpus[:100])

Corpus length: 1115394 characters

First 100 characters:
 First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
# Implement BPE
num_merges = 100
vocab = bpe(corpus, num_merges)

# Tokenize the corpus
tokenized_corpus = tokenize(corpus, vocab)

# Calculate and display the total tokens in the vocabulary and the compression ratio
total_tokens = len(set(tokenized_corpus))
compression_ratio = calculate_compression_ratio(corpus, tokenized_corpus)

print(f"Total tokens in vocabulary: {total_tokens}")
print(f"Compression ratio: {compression_ratio:.2f}")


Total tokens in vocabulary: 404
Compression ratio: 0.19
