In [None]:
import numpy as np
import re
import itertools
import nltk
from nltk.corpus import brown, stopwords

nltk.download('brown')
stopwords = stopwords.words('english')

### Tokenizers
Tokenizers can be of different methods:
1. **Word Tokenization:**
    1. **Space Tokenization:** Breaking down a sentence into tokens based on white space between words.
    2. **Punctuation Tokenization:** Tokenizing punctuations separately so that there aren't extra tokens for each combination of {word}_{punc}.
    3. **Rule-based tokenization:** Tokenizing the word **Don't** can have an issue with Punctuation Tokenization, so adding extra rules.
2. **Character Tokenization:**
    1. Tokenizing each character and punctuation, small vocabulary size.
    2. Trouble with context independent semantic representation. For example - representation of 'c' v/s representation of word 'car'
3. **Subword-Tokenization:**
    1. Best of both above - lesser vocabulary size, context independent semantic representations
    2. Methods:
        1. **BPE:** Byte-Pair Encoding, 
        2. **Byte-Level BPE:** Used in GPT
        3. **Word-Piece:** Used in BERT
        4. **Sentence-Piece:** Used in XLNet, T5

### Byte Pair Encoding (BPE)

#### Wikipedia Example - Character Level

In [2]:
def find_most_occuring_pair(string, pairs):
    max_count = 0
    best_pair = None
    for pair in pairs:
        count = 0
        for char_idx in range(len(string)):
            if pair == string[char_idx:char_idx+2]:
                count += 1
        if count > max_count:
            max_count = count
            best_pair = pair
    return best_pair


def iteration(string, replacer):
    vocab = list(set(string))

    pairs1 = list(itertools.permutations(vocab, 2))
    pairs2 = list(itertools.combinations_with_replacement(vocab, 2))
    pairs = set(pairs1 + pairs2)
    
    pairs = list(map(lambda x: ''.join(x), pairs))
    best_pair = find_most_occuring_pair(string, pairs)

    if best_pair is not None:
        string = string.replace(best_pair, replacer)
    return string, best_pair
    

def run_BPE(input_string, replacers):
    string = input_string
    best_pair = input_string[:2]
    vocab = {}
    idx = 0 

    while True:
        string, best_pair = iteration(string, replacers[idx])
        if best_pair is not None:
            vocab[replacers[idx]] = best_pair
            print('Best Pair:', best_pair, '| String:', string)
            idx += 1
        else:
            return vocab

In [3]:
string = 'aaabdaaabac'
replacers = [chr(65 + i) for i in range(26)]
replacers.reverse()

print('Original String:', string, '\n')
run_BPE(string, replacers)

Original String: aaabdaaabac 

Best Pair: aa | String: ZabdZabac
Best Pair: Za | String: YbdYbac
Best Pair: Yb | String: XdXac
Best Pair: dX | String: XWac
Best Pair: Wa | String: XVc
Best Pair: XV | String: Uc
Best Pair: Uc | String: T


{'Z': 'aa', 'Y': 'Za', 'X': 'Yb', 'W': 'dX', 'V': 'Wa', 'U': 'XV', 'T': 'Uc'}

#### Hugging Face example - Word level

In [4]:
def find_best_pair(corpus, pairs):
    max_count = 0
    best_pair = None

    for pair in pairs:
        count = 0    
        for idx in range(len(corpus)):
            if pair in corpus[idx,0]:
                count += int(corpus[idx,1])
        if count > max_count:
            max_count = count
            best_pair = pair
            
    return max_count, best_pair


def run_iteration(corpus):
    chars = list(map(lambda x: list(x), corpus[:, 0]))
    base_vocab = list(set(itertools.chain(*chars)))

    pairs1 = list(itertools.permutations(base_vocab, 2))
    pairs2 = list(itertools.combinations_with_replacement(base_vocab, 2))
    pairs = set(pairs1 + pairs2)

    pairs = list(map(lambda x: ''.join(x), pairs))

    max_count, best_pair = find_best_pair(corpus, pairs)
    return best_pair, len(base_vocab)


def run_BPE(base_corpus, replacers, max_vocabulary_size):
    corpus = base_corpus 
    idx = 0
    vocabulary_size = float('inf')
    while vocabulary_size >= max_vocabulary_size:
        replacer = replacers[idx]
        best_pair, vocabulary_size = run_iteration(corpus) 
        
        if best_pair is not None:
            corpus[:,0] = list(map(lambda x: x.replace(best_pair, replacer), corpus[:, 0]))
            
            print_corpus = list(map(lambda x: list([x[0], int(x[1])]), corpus))
            print('Best Pair:', best_pair, '| Vocabulary Size:', vocabulary_size, '| Corpus:', print_corpus)
            idx += 1
        else:
            return 

In [5]:
max_vocabulary_size = 6
base_corpus = [('hug', 10), ('pug', 5), ('pun', 12), ('bun', 4), ('hugs', 5)]
print('Base Corpus:', base_corpus, '\n')

base_corpus = np.array(base_corpus)
run_BPE(base_corpus, replacers, max_vocabulary_size)

Base Corpus: [('hug', 10), ('pug', 5), ('pun', 12), ('bun', 4), ('hugs', 5)] 

Best Pair: ug | Vocabulary Size: 7 | Corpus: [['hZ', 10], ['pZ', 5], ['pun', 12], ['bun', 4], ['hZs', 5]]
Best Pair: un | Vocabulary Size: 7 | Corpus: [['hZ', 10], ['pZ', 5], ['pY', 12], ['bY', 4], ['hZs', 5]]
Best Pair: hZ | Vocabulary Size: 6 | Corpus: [['X', 10], ['pZ', 5], ['pY', 12], ['bY', 4], ['Xs', 5]]
Best Pair: pY | Vocabulary Size: 6 | Corpus: [['X', 10], ['pZ', 5], ['W', 12], ['bY', 4], ['Xs', 5]]
Best Pair: pZ | Vocabulary Size: 7 | Corpus: [['X', 10], ['V', 5], ['W', 12], ['bY', 4], ['Xs', 5]]
Best Pair: Xs | Vocabulary Size: 6 | Corpus: [['X', 10], ['V', 5], ['W', 12], ['bY', 4], ['U', 5]]
Best Pair: bY | Vocabulary Size: 6 | Corpus: [['X', 10], ['V', 5], ['W', 12], ['T', 4], ['U', 5]]


### Word Piece Tokenizer

Word Piece Tokenizer is sub-word tokenization scheme, similar to BPE and is used by BERT. We will briefly look into pre-trained word embeddings from BERT model, and how to prepare data for training.

In [6]:
from transformers import AutoTokenizer
from transformers import BertModel

bert = BertModel.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
embedding_size = bert.config.to_dict()['hidden_size']
vocab_size = bert.config.to_dict()['vocab_size']
embedding_size, vocab_size

(768, 30522)

In [8]:
vocab_size = tokenizer.vocab_size
max_length = tokenizer.model_max_length
token2int = tokenizer.vocab
int2token = {v:k for k, v in token2int.items()}
vocab_size, max_length

(30522, 512)

In [9]:
print('<BOS>:', tokenizer.bos_token_id)
print('<EOS>:', tokenizer.eos_token_id)
print('<PAD>:', tokenizer.pad_token_id)
print('<UNK>:', tokenizer.unk_token_id)
print('<CLS>:', tokenizer.cls_token_id)
print('<SEP>:', tokenizer.sep_token_id)
print('<MASK>:', tokenizer.mask_token_id)

<BOS>: None
<EOS>: None
<PAD>: 0
<UNK>: 100
<CLS>: 101
<SEP>: 102
<MASK>: 103


In [10]:
sents = ['The weather is nice today.', 'I am trying to learn different kinds of Tokenizers from Hugging Face.']

# PyTorch tensors
encoded_input = tokenizer(sents, padding=True, truncation=True, return_tensors='pt')
encoded_input

{'input_ids': tensor([[  101,  1996,  4633,  2003,  3835,  2651,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0],
        [  101,  1045,  2572,  2667,  2000,  4553,  2367,  7957,  1997, 19204,
         17629,  2015,  2013, 17662,  2227,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
for idx in range(len(sents)):
    decoded_sent = tokenizer.decode(encoded_input['input_ids'][idx])
    print('Decoded Sentence:', decoded_sent)

Decoded Sentence: [CLS] the weather is nice today. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
Decoded Sentence: [CLS] i am trying to learn different kinds of tokenizers from hugging face. [SEP]


In [12]:
# sentence tokenization
print(sents[1].lower().split(' '))

# word piece tokenization
print(tokenizer.tokenize(sents[1]))

['i', 'am', 'trying', 'to', 'learn', 'different', 'kinds', 'of', 'tokenizers', 'from', 'hugging', 'face.']
['i', 'am', 'trying', 'to', 'learn', 'different', 'kinds', 'of', 'token', '##izer', '##s', 'from', 'hugging', 'face', '.']


### BERT Embeddings

BERT is Bi-Directional Encoder Representations from Transformers. It's trained on two tasks: 
1. Masked Language Modelling: Predicting masked tokens in sentence from non-masked ones
2. Sentence Classification: Given two sentences A and B, classify whether A follows B or not


The output of BERT layer is:
1. **Sequence output:** 
    1. The hidden state of last layer in stacked model, capturing the context of whole sentence. 
    2. The shape of this output is: [num_examples, max_length, embedding_size]
2. **Pooled output:** 
    1. This can be said as the embedding of [CLS] token
    2. The shape of this output is: [num_examples, embedding_size]


For tasks like Sentiment Analysis, we can do the following:
1. Embedding of [CLS] token from sequence output i.e. 1st embedding vector for each sentence can be used as sentence summary
2. Embedding of [CLS] token from pooled output, this is better representation of sentence embedding, and is typically followed for sentence classification
3. Mean pooling (Averaging) of embeddings of words in a sentence

In [13]:
output = bert(encoded_input['input_ids'])
print('Sequence output shape:', output['last_hidden_state'].shape)
print('Pooled output shape:', output['pooler_output'].shape)

Sequence output shape: torch.Size([2, 17, 768])
Pooled output shape: torch.Size([2, 768])
