In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


# Finetuning GPT on Simpsons Episodes

This is an exercise in using Huggingface to finetune GPT.

In [2]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Using pad_token, but it is not set yet.


In [6]:
dzongkha_data = TextDataset(
    tokenizer=tokenizer,
    file_path='dzongkha.txt',  # dzongkha text summaries
    block_size=32  # length of each chunk of text to use as a datapoint
)

In [7]:
dzongkha_data[0], dzongkha_data[0].shape

(tensor([   35,    57, 18494,    42,  7801,    25,   220,   156,   121,   222,
           156,   121,   251, 41340,   233,   198,   198, 33676,  1565, 14887,
          1961,    25,   479,   559,   628,   628,   198,   198,    35,    57,
         18494,    42]), torch.Size([32]))

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

In [9]:
collator_example = data_collator([tokenizer('ཀུན་གསལ་'), tokenizer('ཀུན་གསལ་')])

collator_example

{'input_ids': tensor([[  156,   121,   222,   156,   121,   112,   156,   121,   241, 41340,
           233,   156,   121,   224,   156,   121,    99,   156,   121,    96,
         41340,   233],
        [  156,   121,   222,   156,   121,   112,   156,   121,   241, 41340,
           233,   156,   121,   224,   156,   121,    99,   156,   121,    96,
         41340,   233]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  156,   121,   222,   156,   121,   112,   156,   121,   241, 41340,
           233,   156,   121,   224,   156,   121,    99,   156,   121,    96,
         41340,   233],
        [  156,   121,   222,   156,   121,   112,   156,   121,   241, 41340,
           233,   156,   121,   224,   156,   121,    99,   156,   121,    96,
         41340,   233]])}

In [10]:
model_checkpoint = 'gpt2'

model = GPT2LMHeadModel.from_pretrained(model_checkpoint)  # load up a GPT2 model

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
training_args = TrainingArguments(
    output_dir="caffsean/gpt2-dzongkha-romanization", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=len(dzongkha_data.examples) // 5, # number of warmup steps for learning rate scheduler,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    hub_token='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',
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dzongkha_data.examples[:int(len(dzongkha_data.examples)*.8)],
    eval_dataset=dzongkha_data.examples[int(len(dzongkha_data.examples)*.8):]
)

trainer.evaluate()

Cloning https://huggingface.co/caffsean/gpt2-dzongkha-romanization into local empty directory.
***** Running Evaluation *****
  Num examples = 333
  Batch size = 32


{'eval_loss': 3.969970226287842,
 'eval_runtime': 3.7448,
 'eval_samples_per_second': 88.923,
 'eval_steps_per_second': 2.937}

In [12]:
trainer.train()

***** Running training *****
  Num examples = 1329
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 420
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,No log,2.785023
2,3.694000,1.278262
3,1.784000,1.051425
4,1.166000,0.979681
5,1.055900,0.958167
6,0.976500,0.9157
7,0.976500,0.885065
8,0.933200,0.875115
9,0.879000,0.870415
10,0.834000,0.864399


***** Running Evaluation *****
  Num examples = 333
  Batch size = 32


Saving model checkpoint to caffsean/gpt2-dzongkha-romanization/checkpoint-42
Configuration saved in caffsean/gpt2-dzongkha-romanization/checkpoint-42/config.json
Configuration saved in caffsean/gpt2-dzongkha-romanization/checkpoint-42/generation_config.json
Model weights saved in caffsean/gpt2-dzongkha-romanization/checkpoint-42/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 333
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-dzongkha-romanization/checkpoint-84
Configuration saved in caffsean/gpt2-dzongkha-romanization/checkpoint-84/config.json
Configuration saved in caffsean/gpt2-dzongkha-romanization/checkpoint-84/generation_config.json
Model weights saved in caffsean/gpt2-dzongkha-romanization/checkpoint-84/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 333
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-dzongkha-romanization/checkpoint-126
Configuration saved in caffsean/gpt2-dzongkha-romanization/checkpoint-126/config.jso

TrainOutput(global_step=420, training_loss=1.386727946145194, metrics={'train_runtime': 296.8375, 'train_samples_per_second': 44.772, 'train_steps_per_second': 1.415, 'total_flos': 217035694080000.0, 'train_loss': 1.386727946145194, 'epoch': 10.0})

In [13]:
trainer.push_to_hub()

Saving model checkpoint to caffsean/gpt2-dzongkha-romanization
Configuration saved in caffsean/gpt2-dzongkha-romanization/config.json
Configuration saved in caffsean/gpt2-dzongkha-romanization/generation_config.json
Model weights saved in caffsean/gpt2-dzongkha-romanization/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/487M [00:00<?, ?B/s]

Upload file runs/Feb19_14-44-18_de4395aeab6d/events.out.tfevents.1676817875.de4395aeab6d.278.0: 100%|#########…

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-dzongkha-romanization
   c5182b5..333df8c  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-dzongkha-romanization
   c5182b5..333df8c  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/caffsean/gpt2-dzongkha-romanization
   333df8c..e337e86  main -> main

   333df8c..e337e86  main -> main



'https://huggingface.co/caffsean/gpt2-dzongkha-romanization/commit/333df8c73f90b4af581a6f5e00a7779d1da828a2'

In [None]:
loaded_model = GPT2LMHeadModel.from_pretrained('caffsean/gpt2-dzongkha-romanization')

In [34]:
finetuned_generator = pipeline(
            'text-generation', model=loaded_model, tokenizer=tokenizer, return_full_text=False, max_length=400,do_sample=True, top_p= 0.9, temperature=0.6, repetition_penalty=0.1, top_k=50
          )

In [37]:
def romanize(generator,word):
  print(f'romanizing: {word}')
  prompt = f'DZONGKHA: {word}\n\nROMANIZED:'
  options = generator(prompt,num_return_sequences=1)

  print(options[0]['generated_text'])

In [38]:
romanize(finetuned_generator,'ཀུན་གསལ་')

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.26.1"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


romanizing: ཀུན་གསལ་

: ན�ུ�༽ED
:: ���D�Z:

ZHA
་DZONGKHA: ་ན
ZONGKHAHAEDZHAཀ: ས


ROMANIZED:
 ࣋ ུཋ�
 ནDDAN་

DZONGKHA: ഋུ༓�
: �EDEDZHA་�ഀ
DZONGONGHADZONGAN༽�: �ED

ROM: ണ�

ༀ�
Z༽�K༣ནཀ � IZROM���
::DDAN ༂�༦


ED:K: ༦��K� ��༴་�ONG ལEDIZ

DZONGKHAࣦROMDANKONGHA: ༀONG
EDK༽�ུནༀ

་

ANKKHA:

ONGKHAག�ཽ�D ��
KHADZED � �༓ུDDHAZONGKHAས༓Z ནIZ�DZ���� ��
ROMANIZED���ROM

་��

༽
DZONGONGKHAK་ROM 
