<a href="https://colab.research.google.com/github/shahabday/BAMline4CT/blob/main/03_PigLatin_and_Collators.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U datasets trl

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading 

## Harvard Sentences

Conditions of use: The material on this site is freely available for use in VoIP testing, research, development, marketing and any other reasonable application. The material may be copied, downloaded, broadcast, modified, incorporated into web sites or test equipment. We do require that you identify the source of the speech materials as "Open Speech Repository".

https://www.cs.columbia.edu/~hgs/audio/harvard.html

In [None]:
# Downloads harvard_sentences.txt from Google Drive (addressed by file id).
# The file is later read by relative path in the `load_dataset` call.
!gdown 1pg8hJEdhiHjfcrvo3XzuW4d80xzqnFsl

Downloading...
From: https://drive.google.com/uc?id=1pg8hJEdhiHjfcrvo3XzuW4d80xzqnFsl
To: /content/harvard_sentences.txt
  0% 0.00/30.5k [00:00<?, ?B/s]100% 30.5k/30.5k [00:00<00:00, 48.2MB/s]


## Pig Latin

In [None]:
import re
from string import punctuation

def pig_latin(sentence):
    """Translate an English sentence into lower-cased Pig Latin.

    The sentence is split into word tokens (``\\w+``) and punctuation tokens
    (runs of non-word, non-space characters). Each word is converted as follows:

    * starts with a vowel            -> word + "way"        ("are"  -> "areway")
    * starts with consonant(s)       -> move them to the end and add "ay"
                                                             ("how"  -> "owhay")
    * contains no vowel at all       -> word + "ay"         ("my"   -> "myay")

    Punctuation tokens, including multi-character runs such as "..." or "?!",
    are copied through unchanged and attached to the preceding word.

    Args:
        sentence: The text to translate.

    Returns:
        The translated sentence, lower-cased, with words separated by single
        spaces. An empty input yields an empty string.
    """
    vowels = frozenset('aeiou')
    toks = [t.lower() for t in re.findall(r'\w+|[^\s\w]+', sentence)]

    def convert(string):
        # A token without any letter or digit is punctuation. Checking every
        # character (instead of `string in punctuation`, a substring test)
        # also catches runs like "..." that are not a slice of `punctuation`.
        if not any(ch.isalnum() for ch in string):
            return string
        if string[0] in vowels:
            return ' ' + string + 'way'
        # Index of the first vowel; falls back to len(string) when the word has
        # none, so the whole word is kept intact and only "ay" is appended.
        first_vowel = next(
            (idx for idx, ch in enumerate(string) if ch in vowels),
            len(string),
        )
        return ' ' + string[first_vowel:] + string[:first_vowel] + 'ay'

    return ''.join(convert(t) for t in toks).strip()

In [None]:
# Quick sanity check of the translator on a short sentence.
pig_latin('How are you doing today?')

'owhay areway ouyay oingday odaytay?'

### Load Harvard Sentences

In [None]:
from datasets import load_dataset, Split
# Seed both the shuffle and the split so the train/test partition, and every example
# displayed in later cells, is the same on each Restart & Run All.
SEED = 42

# The file is parsed with the CSV loader; Split.ALL loads it as a single Dataset,
# which is then shuffled and split 80/20 into train and test.
dataset = load_dataset(path='csv', data_files='harvard_sentences.txt', quotechar='"', split=Split.ALL)
dataset = dataset.shuffle(seed=SEED).train_test_split(test_size=0.2, seed=SEED)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Show the resulting splits and their sizes.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 576
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 144
    })
})

In [None]:
# Peek at one raw training example.
dataset['train'][0]

{'sentence': 'The birch looked stark white and lonesome.'}

### Translate Sentences to Pig Latin

In [None]:
def add_translation(example):
    """Return the new `translated` column value for one row."""
    return {'translated': pig_latin(example['sentence'])}

# `map` runs over every split and merges the returned dict into each row.
pig_ds = dataset.map(add_translation)

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

In [None]:
# The same example now carries its Pig Latin translation.
pig_ds['train'][0]

{'sentence': 'The birch looked stark white and lonesome.',
 'translated': 'ethay irchbay ookedlay arkstay itewhay andway onesomelay.'}

### Prompt Dataset

In [None]:
# Rename the columns to the prompt/completion pair used by the formatting function below.
prompt_pig = (
    pig_ds
    .rename_columns({'sentence': 'prompt', 'translated': 'completion'})
    .select_columns(['prompt', 'completion'])
)

In [None]:
# Confirm the renamed columns on one example.
prompt_pig['train'][0]

{'prompt': 'The birch looked stark white and lonesome.',
 'completion': 'ethay irchbay ookedlay arkstay itewhay andway onesomelay.'}

## Data Collators

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
from torch.utils.data import DataLoader
from trl import DataCollatorForCompletionOnlyLM

base_model_id = 'microsoft/phi-2'

tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Register a dedicated pad token that is distinct from the EOS token, so that masking
# the padding in the labels does not also mask the end-of-sequence token.
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})

# Pad on the left: the real tokens then sit at the end of every fixed-length sequence.
tokenizer.padding_side = 'left'

In [None]:
# Verify that the new pad token is registered alongside the existing special tokens.
tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'pad_token': '<|pad|>'}

### Formatting

Let's build a formatting function that takes both the prompt and the completion and inserts a particular string between them, which will be used to trigger the translation. This string is the response template.

In [None]:
# Marker placed between the prompt and the completion.
response_template = '##[PIGL]##>'
# Registering the marker as a special token makes the tokenizer encode it as one
# single id (50296 in the tokenized output further down) instead of several pieces.
tokenizer.add_special_tokens({'additional_special_tokens': [response_template]})

def formatting_func(example):
    """Build the training text for one example.

    The result is the prompt, the response template, the completion and the
    tokenizer's EOS token, concatenated with no separators in between.

    Args:
        example: A mapping with "prompt" and "completion" entries.

    Returns:
        The concatenated string.
    """
    body = f'{example["prompt"]}{response_template}{example["completion"]}'
    return body + tokenizer.eos_token

# Show the formatted text for one training example.
formatting_func(prompt_pig['train'][0])

'The birch looked stark white and lonesome.##[PIGL]##>ethay irchbay ookedlay arkstay itewhay andway onesomelay.<|endoftext|>'

In [None]:
# Fixed sequence length in tokens: longer examples are truncated and shorter ones are
# padded up to it (see `truncation=True` / `padding="max_length"` below).
max_length = 64

def generate_and_tokenize_prompt(prompt):
    """Format one example and tokenize it to a fixed length.

    Args:
        prompt: A dataset row; it is passed unchanged to `formatting_func`.

    Returns:
        The tokenizer output for the formatted text, truncated/padded to
        `max_length`, with an extra "labels" entry that is a copy of
        "input_ids" (padding positions included).
    """
    text = formatting_func(prompt)
    encoded = tokenizer(
        text,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    # Independent copy, so changing the labels later cannot alter the input ids.
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

# Tokenize the training split and drop the raw text columns in the same pass, leaving
# only the columns produced by `generate_and_tokenize_prompt`.
# NOTE: this rebinds `dataset`, which until now held the raw train/test DatasetDict.
dataset = prompt_pig['train'].map(
    generate_and_tokenize_prompt,
    remove_columns=['prompt', 'completion'],
)
print(dataset[0])

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

{'input_ids': [50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 464, 35122, 354, 3114, 19278, 2330, 290, 300, 1952, 462, 13, 50296, 2788, 323, 4173, 354, 24406, 267, 6545, 10724, 610, 74, 31712, 340, 413, 71, 323, 290, 1014, 3392, 296, 417, 323, 13, 50256], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 50295, 464, 35122, 354, 3114, 19278, 2330, 290, 300, 1952, 462, 13, 50296, 2788, 323, 4173, 354, 24406, 267, 6545, 10724, 610, 74, 31712, 340, 413, 71, 323, 290, 1014, 3

### Comparing Collators

In [None]:
# Reminder of the marker string that is handed to the completion-only collator below.
response_template

'##[PIGL]##>'

In [None]:
# Two collators over the same tokenized data: a plain causal-LM collator (mlm=False)
# and a completion-only collator keyed on the response template.
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
completion_collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# Identical DataLoader settings, so the batches differ only by the collator applied.
dataloader_lm = DataLoader(dataset, batch_size=4, collate_fn=lm_collator)
dataloader_completion = DataLoader(dataset, batch_size=4, collate_fn=completion_collator)

# Keep the original name bound to the completion-only collator, as before.
data_collator = completion_collator

In [None]:
# Take the first batch from each loader so the two collators can be compared
# field by field in the cells below.
batch_lm = next(iter(dataloader_lm))
batch_completion = next(iter(dataloader_completion))

In [None]:
# Do both collators produce the same input ids?
(batch_lm['input_ids'] == batch_completion['input_ids']).all()

tensor(True)

In [None]:
# Do both collators produce the same attention mask?
(batch_lm['attention_mask'] == batch_completion['attention_mask']).all()

tensor(True)

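The only difference is in the labels: the completion-only collator additionally sets every label up to and including the response template to `-100`, so only the completion tokens remain as targets: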

In [None]:
# Do both collators produce the same labels?
(batch_lm['labels'] == batch_completion['labels']).all()

tensor(False)

In [None]:
# Labels of the first example from the plain LM collator, plus the text they decode to.
# Negative entries (-100) are masked positions and are dropped before decoding.
labels_lm = batch_lm['labels'][0]
labels_lm, tokenizer.decode(labels_lm[labels_lm >= 0])

(tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,   464,
         35122,   354,  3114, 19278,  2330,   290,   300,  1952,   462,    13,
         50296,  2788,   323,  4173,   354, 24406,   267,  6545, 10724,   610,
            74, 31712,   340,   413,    71,   323,   290,  1014,  3392,   296,
           417,   323,    13, 50256]),
 'The birch looked stark white and lonesome.##[PIGL]##>ethay irchbay ookedlay arkstay itewhay andway onesomelay.<|endoftext|>')

In [None]:
# Labels of the first example from the completion-only collator, plus the text they decode to.
# Negative entries (-100) are masked positions and are dropped before decoding.
labels_completion = batch_completion['labels'][0]
labels_completion, tokenizer.decode(labels_completion[labels_completion >= 0])

(tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  2788,   323,  4173,   354, 24406,   267,  6545, 10724,   610,
            74, 31712,   340,   413,    71,   323,   290,  1014,  3392,   296,
           417,   323,    13, 50256]),
 'ethay irchbay ookedlay arkstay itewhay andway onesomelay.<|endoftext|>')