In [1]:
!pip install transformers datasets tokenizers
!wget  http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip
!unzip -qq cornell_movie_dialogs_corpus.zip #-qq flag indicate quiet mode where it will suppress all the output except for error message
!rm cornell_movie_dialogs_corpus.zip
!mkdir datasets
!mv cornell\ movie-dialogs\ corpus/movie_conversations.txt ./datasets
!mv cornell\ movie-dialogs\ corpus/movie_lines.txt ./datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import os
import torch, re, random
import transformers, datasets
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer
import numpy as np, math, itertools
import tqdm
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.optim import Adam
from pathlib import Path

## Tokenization:


## Pre-processing:


In [3]:
class BERTDataset(Dataset):
  """Sentence-pair dataset producing inputs for BERT-style pre-training.

  Every item carries two training signals:
    * masked-token prediction: roughly 15% of the whitespace-separated words
      are selected and then masked / replaced / kept (80% / 10% / 10%),
      see `random_word`;
    * next-sentence prediction: the second sentence is either the real
      partner (label 1) or a randomly drawn one (label 0), see `get_sent`.

  Args:
    data_pair: indexable, sized collection of pairs; element 0 and 1 of each
      pair are the first and second sentence (strings).
    tokenizer: object that is callable on a string and returns a mapping whose
      'input_ids' list has one leading and one trailing special token, and that
      exposes a `vocab` mapping containing '[CLS]', '[SEP]', '[PAD]', '[MASK]'.
    seq_len: fixed length every returned sequence is truncated / padded to.
  """

  # Value written to label and segment positions that carry no information
  # (unselected tokens, special tokens, padding).
  # NOTE(review): the loss is presumably configured to ignore this value - confirm.
  IGNORE_INDEX = 0

  def __init__(self, data_pair, tokenizer, seq_len=64):
    self.tokenizer=tokenizer
    self.seq_len=seq_len
    self.corpus_lines=len(data_pair)
    self.lines=data_pair

  def __len__(self):
    """Number of sentence pairs in the dataset."""
    return self.corpus_lines

  def __getitem__(self, item):
    """Build one training example.

    Returns:
      dict of tensors:
        "bert_input":    token ids, [CLS] s1 [SEP] s2 [SEP], padded with the [PAD] id
        "bert_label":    original ids at selected positions, IGNORE_INDEX elsewhere
        "segment_label": 1 for the first sentence, 2 for the second, 0 for padding
        "is_next":       1 if s2 is the true partner of s1, else 0
      The first three have length `seq_len`.
    """
    #special-token ids are looked up once per item
    vocab=self.tokenizer.vocab
    cls_id, sep_id, pad_id=vocab['[CLS]'], vocab['[SEP]'], vocab['[PAD]']
    ignore=self.IGNORE_INDEX

    #getting random sentence pair (pairing outcome saved as is_next_label)
    t1, t2, is_next_label=self.get_sent(item)

    #replacing random words in sentence with mask/random words:
    t1_random, t1_label=self.random_word(t1)
    t2_random, t2_label=self.random_word(t2)

    #adding special tokens; they are never prediction targets:
    t1_ids=[cls_id] + t1_random + [sep_id]
    t2_ids=t2_random + [sep_id]
    t1_label=[ignore] + t1_label + [ignore]
    t2_label=t2_label + [ignore]

    #combining sentence one and two as one input, cut to seq_len.
    #NOTE: when the pair is longer than seq_len the slice also drops the final [SEP].
    segment_label=([1] * len(t1_ids) + [2] * len(t2_ids))[:self.seq_len]
    bert_input=(t1_ids + t2_ids)[:self.seq_len]
    bert_label=(t1_label + t2_label)[:self.seq_len]

    #padding up to seq_len: each sequence gets the filler of its own id space
    #(vocabulary id for the input, ignore value for labels, 0 for segments)
    n_pad=self.seq_len - len(bert_input)
    bert_input.extend([pad_id] * n_pad)
    bert_label.extend([ignore] * n_pad)
    segment_label.extend([0] * n_pad)

    output={
        "bert_input": bert_input,
        "bert_label":bert_label,
        "segment_label": segment_label,
        "is_next": is_next_label
    }

    return {key: torch.tensor(value) for key, value in output.items()}

  def random_word(self, sentence):
    """Apply the masking scheme to one sentence.

    The sentence is split on whitespace; each word is tokenized on its own and
    all of its sub-token ids are treated together.

    Returns:
      (output, output_label): two flat lists of equal length. `output` holds
      the possibly modified ids; `output_label` holds the original ids for
      selected words and IGNORE_INDEX for the rest.
    """
    vocab=self.tokenizer.vocab
    mask_id=vocab['[MASK]']
    vocab_size=len(vocab)

    output=[] #stores modified token ids
    output_label=[] #stores original token ids (or IGNORE_INDEX)

    # according to the bert paper, 15% of the tokens are generally replaced:
    for token in sentence.split():
      prob=random.random() #random prob in [0, 1)

      #removing the leading/trailing special tokens added by the tokenizer
      token_id=self.tokenizer(token)['input_ids'][1:-1]

      # 15% token position at random:
      if prob < 0.15:
        prob /= 0.15 #rescale to [0, 1) to split the selected words 80/10/10

        # 80% masking in the selected position:
        if prob < .8:
          output.extend([mask_id] * len(token_id))

        # 10% change to random token (one draw per sub-token):
        elif prob < 0.9:
          output.extend(random.randrange(vocab_size) for _ in token_id)

        # 10% keep the current token:
        else:
          output.extend(token_id)

        output_label.extend(token_id)

      else:
        output.extend(token_id)
        output_label.extend([self.IGNORE_INDEX] * len(token_id))

    assert len(output) == len(output_label)
    return output, output_label

  def get_sent(self, index):
    """Return (first sentence, second sentence, is_next) for pair `index`.

    With probability ~0.5 the true second sentence is kept (is_next=1);
    otherwise it is swapped for a random second sentence (is_next=0).
    """
    t1, t2=self.get_corpus_line(index)

    #for next sentence prediction:
    if random.random() > 0.5:
      return t1, t2, 1
    else:
      #NOTE: the random draw is not checked against `index`, so it can
      #occasionally return the true partner while labelled 0.
      return t1, self.get_random_line(), 0


  def get_corpus_line(self, item):
    """Return the (first, second) sentence pair stored at position `item`."""
    return self.lines[item][0], self.lines[item][1]

  def get_random_line(self):
    """Return the second sentence of a uniformly chosen pair."""
    return self.lines[random.randrange(len(self.lines))][1]
