<a href="https://colab.research.google.com/github/raz0208/Techniques-For-Text-Analysis/blob/main/BPE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
from collections import defaultdict

In [15]:
def get_stats(vocab):
    """
    Count how often each pair of neighbouring symbols occurs in a vocabulary.

    `vocab` maps words, written as whitespace-separated symbols, to their
    frequency. Each adjacent (left, right) symbol pair inside a word adds that
    word's frequency to the pair's total. The totals are returned as a
    defaultdict keyed by the pair tuple.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Zipping the sequence with itself shifted by one walks every
        # neighbouring pair in order; a single-symbol word yields nothing.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

In [16]:
def merge_vocab(pair, v_in):
    """
    Return a new vocabulary in which every occurrence of `pair` is merged.

    `pair` is a 2-tuple of symbols and `v_in` maps words (whitespace-separated
    symbol sequences) to frequencies. Wherever the two symbols appear next to
    each other as whole symbols, they are replaced by their concatenation.
    `v_in` itself is not modified; a plain dict is returned.

    If two input words become identical after the merge, their frequencies
    are summed rather than one overwriting the other.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or the string edge) on both sides,
    # so the pair only matches whole symbols, never the tail/head of longer ones.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement inserts `merged` literally. A plain string
        # would be treated as a template, so symbols containing a backslash
        # would be misread as escapes (or raise re.error).
        w_out = p.sub(lambda _match: merged, word)
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

In [17]:
def get_vocab(data):
    """
    Build the initial character-level vocabulary from a list of strings.

    Each string is split on whitespace. Every resulting word is rewritten as
    its characters separated by single spaces, followed by the end-of-word
    marker ' </w>'. The returned defaultdict maps each rewritten word to the
    number of times it occurs in `data`.
    """
    counts = defaultdict(int)
    tokens = (token for text in data for token in text.split())
    for token in tokens:
        # Joining a string directly spaces out its individual characters.
        key = ' '.join(token) + ' </w>'
        counts[key] += 1
    return counts

In [18]:
def byte_pair_encoding(data, n):
    """
    Apply up to `n` byte-pair-encoding merge steps to the words in `data`.

    `data` is a list of strings and `n` is the maximum number of merges.
    Each step counts adjacent symbol pairs across the vocabulary, picks the
    most frequent pair, and merges it everywhere it occurs.

    Returns the resulting vocabulary: a dictionary mapping each word, written
    as space-separated symbols ending in '</w>', to its frequency.

    Merging stops early once no adjacent pairs remain, i.e. every word is
    already a single symbol or `data` contains no words. Without this check,
    `max` would raise ValueError on the empty pair table.
    """
    vocab = get_vocab(data)
    for _ in range(n):
        pairs = get_stats(vocab)
        if not pairs:
            # Nothing left to merge; further iterations would be no-ops.
            break
        # Ties are resolved by max(): the first pair encountered wins.
        best = max(pairs, key=pairs.get)
        vocab = merge_vocab(best, vocab)
    return vocab

In [19]:
# Example usage: run BPE on a short paragraph about tokenization.
# The corpus text is kept verbatim; the recorded cell output depends on it.
corpus = '''Tokenization is the process of breaking down
a sequence of text into smaller units called tokens,
which can be words, phrases, or even individual characters.
Tokenization is often the first step in natural languages processing tasks
such as text classification, named entity recognition, and sentiment analysis.
The resulting tokens are typically used as input to further processing steps,
such as vectorization, where the tokens are converted
into numerical representations for machine learning models to use.'''
# Split on '.' to get sentence-like chunks. Because the corpus ends with a
# period, the last chunk is an empty string. Chunks still contain newlines,
# and commas stay attached to their words.
data = corpus.split('.')

# Number of merge steps requested. In the recorded output below, every word
# ends up merged into a single symbol.
n = 230
bpe_pairs = byte_pair_encoding(data, n)
# Last expression of the cell, so the notebook displays the resulting vocabulary.
bpe_pairs

{'Tokenization</w>': 2,
 'is</w>': 2,
 'the</w>': 3,
 'process</w>': 1,
 'of</w>': 2,
 'breaking</w>': 1,
 'down</w>': 1,
 'a</w>': 1,
 'sequence</w>': 1,
 'text</w>': 2,
 'into</w>': 2,
 'smaller</w>': 1,
 'units</w>': 1,
 'called</w>': 1,
 'tokens,</w>': 1,
 'which</w>': 1,
 'can</w>': 1,
 'be</w>': 1,
 'words,</w>': 1,
 'phrases,</w>': 1,
 'or</w>': 1,
 'even</w>': 1,
 'individual</w>': 1,
 'characters</w>': 1,
 'often</w>': 1,
 'first</w>': 1,
 'step</w>': 1,
 'in</w>': 1,
 'natural</w>': 1,
 'languages</w>': 1,
 'processing</w>': 2,
 'tasks</w>': 1,
 'such</w>': 2,
 'as</w>': 3,
 'classification,</w>': 1,
 'named</w>': 1,
 'entity</w>': 1,
 'recognition,</w>': 1,
 'and</w>': 1,
 'sentiment</w>': 1,
 'analysis</w>': 1,
 'The</w>': 1,
 'resulting</w>': 1,
 'tokens</w>': 2,
 'are</w>': 2,
 'typically</w>': 1,
 'used</w>': 1,
 'input</w>': 1,
 'to</w>': 2,
 'further</w>': 1,
 'steps,</w>': 1,
 'vectorization,</w>': 1,
 'where</w>': 1,
 'converted</w>': 1,
 'numerical</w>': 1,
 'repres