In [1]:
# Install into the kernel's own environment (%pip) rather than whichever pip is first on PATH.
%pip install transformers datasets tokenizers
!wget http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
# -qq: quiet mode, only error messages are shown; -o: overwrite without prompting so a re-run does not block
!unzip -qq -o cornell_movie_dialogs_corpus.zip
!rm -f cornell_movie_dialogs_corpus.zip
# -p: do not fail when the directory already exists (keeps the cell re-runnable)
!mkdir -p datasets
!mv cornell\ movie-dialogs\ corpus/movie_conversations.txt ./datasets
!mv cornell\ movie-dialogs\ corpus/movie_lines.txt ./datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import os
import torch, re, random
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import numpy as np, math, itertools
import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.optim import Adam
from pathlib import Path

## Tokenization:


In [3]:
from collections import defaultdict
from transformers import AutoTokenizer

# Pretrained cased BERT tokenizer, fetched from the Hugging Face Hub on first use.
tokenizer=AutoTokenizer.from_pretrained(
    'bert-base-cased',
)

# Toy corpus used for the tokenization walkthrough below.
corpus=[
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [7]:
#@ Counting how often each pre-tokenized word occurs in the corpus:
word_freqs=defaultdict(int)
pre_tokenize=tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str

for text in corpus:
  # the pre-tokenizer output is unpacked as (word, offset) pairs; only the word is counted
  words_with_offsets=pre_tokenize(text)
  new_words=[word for word, _ in words_with_offsets]
  print(new_words)
  for word in new_words:
    word_freqs[word]+=1

print(f'\nfinal word freq"{word_freqs}')

['This', 'is', 'the', 'Hugging', 'Face', 'Course', '.']
['This', 'chapter', 'is', 'about', 'tokenization', '.']
['This', 'section', 'shows', 'several', 'tokenizer', 'algorithms', '.']
['Hopefully', ',', 'you', 'will', 'be', 'able', 'to', 'understand', 'how', 'they', 'are', 'trained', 'and', 'generate', 'tokens', '.']

final word freq"defaultdict(<class 'int'>, {'This': 3, 'is': 2, 'the': 1, 'Hugging': 1, 'Face': 1, 'Course': 1, '.': 4, 'chapter': 1, 'about': 1, 'tokenization': 1, 'section': 1, 'shows': 1, 'several': 1, 'tokenizer': 1, 'algorithms': 1, 'Hopefully': 1, ',': 1, 'you': 1, 'will': 1, 'be': 1, 'able': 1, 'to': 1, 'understand': 1, 'how': 1, 'they': 1, 'are': 1, 'trained': 1, 'and': 1, 'generate': 1, 'tokens': 1})


In [9]:
#@ Collecting the initial alphabet from every word:
# A word's first character is kept as-is; every later character gets the
# '##' continuation prefix. A set removes duplicates before sorting.
alphabet_set=set()
for word in word_freqs:
  alphabet_set.add(word[0])
  alphabet_set.update(f'##{letter}' for letter in word[1:])

alphabet=sorted(alphabet_set)
print(f'All alphabet: {alphabet}')

All alphabet: ['##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']


In [10]:
#@ Prepending the special tokens to the alphabet:
special_tokens=['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]']
# unpacking builds a fresh list, so `alphabet` itself is left untouched
vocab=[*special_tokens, *alphabet]
print(vocab)

['[PAD]', '[CLS]', '[SEP]', '[MASK]', '[UNK]', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##r', '##s', '##t', '##u', '##v', '##w', '##y', '##z', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'g', 'h', 'i', 's', 't', 'u', 'w', 'y']


In [13]:
# Split each word into characters: the first character stays bare,
# all following characters carry the '##' continuation prefix.
splits={
    word: [*word[:1], *(f'##{c}' for c in word[1:])]
    for word in word_freqs
}
print(f'\n Splitted words: {splits}')


 Splitted words: {'This': ['T', '##h', '##i', '##s'], 'is': ['i', '##s'], 'the': ['t', '##h', '##e'], 'Hugging': ['H', '##u', '##g', '##g', '##i', '##n', '##g'], 'Face': ['F', '##a', '##c', '##e'], 'Course': ['C', '##o', '##u', '##r', '##s', '##e'], '.': ['.'], 'chapter': ['c', '##h', '##a', '##p', '##t', '##e', '##r'], 'about': ['a', '##b', '##o', '##u', '##t'], 'tokenization': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##a', '##t', '##i', '##o', '##n'], 'section': ['s', '##e', '##c', '##t', '##i', '##o', '##n'], 'shows': ['s', '##h', '##o', '##w', '##s'], 'several': ['s', '##e', '##v', '##e', '##r', '##a', '##l'], 'tokenizer': ['t', '##o', '##k', '##e', '##n', '##i', '##z', '##e', '##r'], 'algorithms': ['a', '##l', '##g', '##o', '##r', '##i', '##t', '##h', '##m', '##s'], 'Hopefully': ['H', '##o', '##p', '##e', '##f', '##u', '##l', '##l', '##y'], ',': [','], 'you': ['y', '##o', '##u'], 'will': ['w', '##i', '##l', '##l'], 'be': ['b', '##e'], 'able': ['a', '##b', '##l', '##e'], '

In [15]:
#@ computing scores for merging:

def compute_pair_scores(splits):
  """Score every adjacent token pair found in the current word splits.

  The score of a pair is
      pair_frequency / (frequency_of_first_token * frequency_of_second_token)
  where all frequencies are weighted by the word counts read from the
  notebook-level `word_freqs` mapping.

  Args:
    splits: dict mapping each word in `word_freqs` to its list of tokens.

  Returns:
    dict mapping (first_token, second_token) tuples to their float score.
  """
  token_counts=defaultdict(int)
  pair_counts=defaultdict(int)

  for word, count in word_freqs.items():
    pieces=splits[word]

    # every token except the last opens a pair with its right neighbour
    for left, right in zip(pieces, pieces[1:]):
      token_counts[left] += count
      pair_counts[(left, right)] += count

    # the last token (the only one for single-token words) is counted once
    token_counts[pieces[-1]] += count

  scores={}
  for pair, count in pair_counts.items():
    first, second=pair
    scores[pair]=count/(token_counts[first] * token_counts[second])
  return scores

# Pass the full word -> tokens mapping (`splits`). The previous argument `split`
# is not defined by any visible cell, so a fresh kernel raised NameError here.
pair_scores=compute_pair_scores(splits)
print(f'Scores for each pair: {pair_scores}')

Scores for each pair: {('T', '##h'): 0.125, ('##h', '##i'): 0.03409090909090909, ('##i', '##s'): 0.02727272727272727, ('i', '##s'): 0.1, ('t', '##h'): 0.03571428571428571, ('##h', '##e'): 0.011904761904761904, ('H', '##u'): 0.1, ('##u', '##g'): 0.05, ('##g', '##g'): 0.0625, ('##g', '##i'): 0.022727272727272728, ('##i', '##n'): 0.01652892561983471, ('##n', '##g'): 0.022727272727272728, ('F', '##a'): 0.14285714285714285, ('##a', '##c'): 0.07142857142857142, ('##c', '##e'): 0.023809523809523808, ('C', '##o'): 0.07692307692307693, ('##o', '##u'): 0.046153846153846156, ('##u', '##r'): 0.022222222222222223, ('##r', '##s'): 0.022222222222222223, ('##s', '##e'): 0.004761904761904762, ('c', '##h'): 0.125, ('##h', '##a'): 0.017857142857142856, ('##a', '##p'): 0.07142857142857142, ('##p', '##t'): 0.07142857142857142, ('##t', '##e'): 0.013605442176870748, ('##e', '##r'): 0.026455026455026454, ('a', '##b'): 0.2, ('##b', '##o'): 0.038461538461538464, ('##u', '##t'): 0.02857142857142857, ('t', '##o')

In [None]:
#@ Finding pair with the best score:
best_pair=''
max_score=None

# strict '<' keeps the first pair encountered when several share the top score
for pair, score in pair_scores.items():
  if max_score is None or max_score < score:
    best_pair=pair
    max_score=score

print(best_pair, max_score)

# Derive the merged token from the winning pair instead of hard-coding 'ab',
# so the cell stays correct if the corpus (and therefore the best pair) changes.
if best_pair:
  first_token, second_token=best_pair
  # a '##' prefix on the second token only marks continuation; drop it when gluing
  new_token=first_token + second_token[2:] if second_token.startswith('##') else first_token + second_token
  if new_token not in vocab:  # avoid duplicate entries when the cell is re-run
    vocab.append(new_token)

#@ merging the pair:
def merge_pair(a, b, splits):
  """Merge every adjacent occurrence of the token pair (a, b) in all word splits.

  Args:
    a: first token of the pair.
    b: second token of the pair. A leading '##' continuation prefix is
      dropped when the two tokens are concatenated.
    splits: dict mapping each word to its current list of tokens. Every
      entry of this dict is processed and the dict is updated in place.

  Returns:
    The same `splits` dict, with each (a, b) occurrence replaced by the
    single merged token.
  """
  merged=a + b[2:] if b.startswith('##') else a + b

  # Only values of existing keys are reassigned, so iterating the dict is safe.
  for word, split in splits.items():
    if len(split)==1:
      continue
    i=0
    while i < len(split)-1:
      if split[i] == a and split[i+1]==b:
        # Replace the two tokens by the merged one. `i` is not advanced, so
        # the merged token is compared with its new right neighbour next.
        split=split[:i] + [merged] + split[i+2:]
      else:
        i+= 1
    splits[word]=split
  return splits

# Apply the ('a', '##b') merge to every word, then inspect one word that contains it.
splits=merge_pair(a='a', b='##b', splits=splits)
print(splits['about'])


('a', '##b') 0.2


## Pre-processing:


In [None]:
class BERTDataset(Dataset):
  """Sentence-pair dataset producing masked-LM and next-sentence examples.

  Args:
    data_pair: indexable, sized collection of (first_sentence, second_sentence)
      string pairs.
    tokenizer: callable returning a mapping whose 'input_ids' entry is a list;
      its first and last ids are stripped here (treated as [CLS]/[SEP]). It
      must also expose a `vocab` mapping containing '[PAD]', '[CLS]', '[SEP]'
      and '[MASK]'.
    seq_len: fixed length every returned sequence is truncated or padded to.
  """

  def __init__(self, data_pair, tokenizer, seq_len=64):
    self.lines=data_pair
    self.corpus_lines=len(data_pair)
    self.tokenizer=tokenizer
    self.seq_len=seq_len

  def __len__(self):
    """Number of sentence pairs."""
    return self.corpus_lines

  def __getitem__(self, item):
    """Build one training example.

    Returns:
      dict of tensors with keys 'bert_input', 'bert_label', 'segment_label'
      (each of length `seq_len`) and 'is_next' (scalar).
    """
    # sentence pair plus next-sentence label (1 = original pair, 0 = random second sentence)
    first, second, is_next_label=self.get_sent(item)

    # corrupt tokens for the masked-LM objective
    first_ids, first_targets=self.random_word(first)
    second_ids, second_targets=self.random_word(second)

    vocab=self.tokenizer.vocab
    cls_id, sep_id, pad_id=vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']

    # layout: [CLS] first [SEP] second [SEP]; special positions are labelled with the [PAD] id
    first_part=[cls_id, *first_ids, sep_id]
    second_part=[*second_ids, sep_id]
    first_targets=[pad_id, *first_targets, pad_id]
    second_targets=[*second_targets, pad_id]

    # join both halves and cut to the fixed length
    limit=self.seq_len
    segment_label=([1] * len(first_part) + [2] * len(second_part))[:limit]
    bert_input=(first_part + second_part)[:limit]
    bert_label=(first_targets + second_targets)[:limit]

    # all three sequences are padded with the [PAD] id up to seq_len
    padding=[pad_id] * (limit - len(bert_input))
    for sequence in (bert_input, bert_label, segment_label):
      sequence.extend(padding)

    return {
        "bert_input": torch.tensor(bert_input),
        "bert_label": torch.tensor(bert_label),
        "segment_label": torch.tensor(segment_label),
        "is_next": torch.tensor(is_next_label),
    }

  def random_word(self, sentence):
    """Randomly corrupt a whitespace-split sentence for masked-LM training.

    Each word is selected with probability 0.15. A selected word is replaced
    by [MASK] ids (80%), by random vocabulary ids (10%) or kept (10%), and its
    original ids become the labels. Unselected words keep their ids and get
    label 0.

    Returns:
      (ids, labels): two flat lists of equal length.
    """
    output=[]        # possibly corrupted token ids
    output_label=[]  # original ids at selected positions, 0 elsewhere

    for token in sentence.split():
      prob=random.random()

      # drop the first and last id the tokenizer adds around the word
      token_id=self.tokenizer(token)['input_ids'][1:-1]

      if prob >= 0.15:
        # word not selected: ids unchanged, nothing to predict
        output.extend(token_id)
        output_label.extend([0] * len(token_id))
        continue

      # rescale to [0, 1) to choose the kind of corruption
      prob /= 0.15
      if prob < .8:
        # 80%: one [MASK] per sub-token
        output.extend(self.tokenizer.vocab['[MASK]'] for _ in token_id)
      elif prob < 0.9:
        # 10%: one random vocabulary id per sub-token
        output.extend(random.randrange(len(self.tokenizer.vocab)) for _ in token_id)
      else:
        # 10%: keep the word as it is
        output.extend(token_id)

      output_label.extend(token_id)

    assert len(output) == len(output_label)
    return output, output_label

  def get_sent(self, index):
    """Return (first, second, is_next): the stored pair with label 1, or the
    first sentence with a randomly drawn second sentence and label 0."""
    t1, t2=self.get_corpus_line(index)

    # for next sentence prediction:
    if random.random() > 0.5:
      return t1, t2, 1
    return t1, self.get_random_line(), 0

  def get_corpus_line(self, item):
    """Return the sentence pair stored at position `item`."""
    pair=self.lines[item]
    return pair[0], pair[1]

  def get_random_line(self):
    """Return the second sentence of a randomly chosen pair."""
    pick=random.randrange(len(self.lines))
    return self.lines[pick][1]
