In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


# Finetuning GPT-2 for Thai Romanization

This is an exercise in using Hugging Face Transformers to fine-tune GPT-2 to romanize Thai text.

In [2]:
import os

from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [3]:
# Load the pretrained GPT-2 tokenizer (its vocab/merges files are downloaded on first use).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:
# The GPT-2 tokenizer has no padding token set, so register one if it is missing.
# NOTE(review): adding '[PAD]' as a new special token enlarges the tokenizer's vocabulary,
# but no cell visible here resizes the model's embeddings to match. Confirm that the pad id
# is never actually fed to the model, or resize the embeddings / reuse an existing token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Using pad_token, but it is not set yet.


In [5]:
# Tokenize the training text file and cut it into fixed-length blocks of token ids.
thai_data = TextDataset(
    tokenizer=tokenizer,
    file_path='thai_romanization_gpt.txt',  # Thai romanization training text (presumably 'THAI: ... ROMANIZED: ...' records — confirm)
    block_size=32  # length of each chunk of text to use as a datapoint
)



In [6]:
# Sanity check: show the first example (a tensor of token ids) together with its shape.
thai_data[0], thai_data[0].shape

(tensor([ 4221, 20185,    25,   220, 19567,   223, 19567,   255, 19567,   229,
         19567,   252, 19567,   109, 19567,   247, 19567,   245, 19567,   104,
         19567,   110, 19567,    96, 19567,   249, 19567,   115, 19567,   247,
         31479,   225]), torch.Size([32]))

In [7]:
# Batch builder for causal language modelling: mlm=False turns off
# masked-language-model style masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [8]:
# Collate the same tokenized Thai string twice to see what a batch looks like.
collator_example = data_collator([tokenizer('บ้านเทพพยัคฆ์ใต้') for _ in range(2)])

collator_example

{'input_ids': tensor([[19567,   248, 31479,   231, 19567,   110, 19567,   247, 31479,   222,
         19567,   245, 19567,   252, 19567,   252, 19567,    95, 19567,   109,
         19567,   226, 19567,   228, 31479,   234, 31479,   225, 19567,   243,
         31479,   231],
        [19567,   248, 31479,   231, 19567,   110, 19567,   247, 31479,   222,
         19567,   245, 19567,   252, 19567,   252, 19567,    95, 19567,   109,
         19567,   226, 19567,   228, 31479,   234, 31479,   225, 19567,   243,
         31479,   231]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[19567,   248, 31479,   231, 19567,   110, 19567,   247, 31479,   222,
         19567,   245, 19567,   252, 19567,   252, 19567,    95, 19567,   109,
         19567,   226, 19567,   228, 31479, 

In [9]:
model_checkpoint = 'gpt2'

model = GPT2LMHeadModel.from_pretrained(model_checkpoint)  # load up a GPT2 model

# Baseline text-generation pipeline around the not-yet-finetuned model.
# The sampling settings are passed as keyword arguments: pipeline()'s `config` parameter
# expects a model configuration (name or PretrainedConfig), so a dict of generation
# options handed to it is not applied to generation.
pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    max_length=200, do_sample=True, top_p=0.9, temperature=0.7, top_k=10
)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
# First 80% of the token blocks are used for training, the remaining 20% for evaluation.
split_index = int(len(thai_data.examples) * .8)

training_args = TrainingArguments(
    output_dir="caffsean/gpt2-thai-romanization", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=10, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    # NOTE(review): this is derived from the number of *examples*, not optimizer steps; with
    # the run logged below it amounts to roughly 80% of all optimization steps being warmup.
    # Confirm that this is intended.
    warmup_steps=len(thai_data.examples) // 5, # number of warmup steps for learning rate scheduler,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Never hardcode credentials in a notebook: read the Hub token from the HF_TOKEN
    # environment variable. When it is unset this is None and the token cached by
    # `huggingface-cli login` is used instead. Any token that was previously committed
    # in this cell should be revoked.
    hub_token=os.environ.get('HF_TOKEN'),
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=thai_data.examples[:split_index],
    eval_dataset=thai_data.examples[split_index:]
)

# Baseline evaluation loss of the pretrained model before any fine-tuning.
trainer.evaluate()

Cloning https://huggingface.co/caffsean/gpt2-thai-romanization into local empty directory.
***** Running Evaluation *****
  Num examples = 23742
  Batch size = 32


{'eval_loss': 3.753413677215576,
 'eval_runtime': 63.3027,
 'eval_samples_per_second': 375.055,
 'eval_steps_per_second': 11.721}

In [11]:
# Run fine-tuning; with the arguments above, evaluation and checkpointing happen once per epoch.
trainer.train()

***** Running training *****
  Num examples = 94964
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 29680
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,1.4961,1.416093
2,1.3384,1.279297
3,1.2067,1.169361


***** Running Evaluation *****
  Num examples = 23742
  Batch size = 32


Saving model checkpoint to caffsean/gpt2-thai-romanization/checkpoint-2968
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-2968/config.json
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-2968/generation_config.json
Model weights saved in caffsean/gpt2-thai-romanization/checkpoint-2968/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23742
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-thai-romanization/checkpoint-5936
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-5936/config.json
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-5936/generation_config.json
Model weights saved in caffsean/gpt2-thai-romanization/checkpoint-5936/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23742
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-thai-romanization/checkpoint-8904
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-8904/config.json
Configuration sa

Epoch,Training Loss,Validation Loss
1,1.4961,1.416093
2,1.3384,1.279297
3,1.2067,1.169361
4,1.1307,1.098709
5,1.0965,1.053073
6,1.045,1.02333
7,1.0179,1.002129
8,0.9902,0.982626
9,0.9405,0.966046
10,0.9145,0.959653


***** Running Evaluation *****
  Num examples = 23742
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-thai-romanization/checkpoint-11872
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-11872/config.json
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-11872/generation_config.json
Model weights saved in caffsean/gpt2-thai-romanization/checkpoint-11872/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23742
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-thai-romanization/checkpoint-14840
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-14840/config.json
Configuration saved in caffsean/gpt2-thai-romanization/checkpoint-14840/generation_config.json
Model weights saved in caffsean/gpt2-thai-romanization/checkpoint-14840/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 23742
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-thai-romanization/checkpoint-17808
Configuration save

TrainOutput(global_step=29680, training_loss=1.1802517348543975, metrics={'train_runtime': 9850.2363, 'train_samples_per_second': 96.408, 'train_steps_per_second': 3.013, 'total_flos': 1.550833532928e+16, 'train_loss': 1.1802517348543975, 'epoch': 10.0})

In [12]:
# Save the model and upload it to the Hugging Face Hub repository configured in training_args.
trainer.push_to_hub()

Saving model checkpoint to caffsean/gpt2-thai-romanization
Configuration saved in caffsean/gpt2-thai-romanization/config.json
Configuration saved in caffsean/gpt2-thai-romanization/generation_config.json
Model weights saved in caffsean/gpt2-thai-romanization/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/487M [00:00<?, ?B/s]

Upload file runs/Feb20_01-44-17_8c45581dedc8/events.out.tfevents.1676857532.8c45581dedc8.106.0:  32%|###2     …

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-thai-romanization
   4968aa0..fb35ed4  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-thai-romanization
   4968aa0..fb35ed4  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/caffsean/gpt2-thai-romanization
   fb35ed4..f4c2cd1  main -> main

   fb35ed4..f4c2cd1  main -> main



'https://huggingface.co/caffsean/gpt2-thai-romanization/commit/fb35ed4f0c30bebc01df7d6d04d3dc179efbe5de'

In [None]:
# Reload the fine-tuned weights from the Hub repository they were pushed to.
loaded_model = GPT2LMHeadModel.from_pretrained('caffsean/gpt2-thai-romanization')

In [14]:
# Text-generation pipeline around the fine-tuned model. return_full_text=False makes the
# pipeline return only the newly generated continuation, not the prompt.
# NOTE(review): repetition_penalty=0.1 is below 1.0; per the Hugging Face generation docs
# 1.0 means "no penalty", so a value this small appears to reward repetition rather than
# discourage it. Confirm this is intended.
finetuned_generator = pipeline(
    'text-generation',
    model=loaded_model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_length=400,
    do_sample=True,
    top_p=0.9,
    temperature=0.6,
    repetition_penalty=0.1,
    top_k=50,
)

In [15]:
def romanize(generator,word):
    """Print the romanization that `generator` produces for a Thai `word`.

    Args:
        generator: Callable invoked as ``generator(prompt, num_return_sequences=1)``
            that returns a sequence of dicts with a ``'generated_text'`` entry
            (e.g. a text-generation pipeline).
        word: The Thai text to romanize.

    Returns:
        None. The word being romanized and the first generated text are printed.
    """
    print(f'romanizing: {word}')
    # The prompt format is 'THAI: <word>', a blank line, then the 'ROMANIZED:' cue.
    completions = generator(f'THAI: {word}\n\nROMANIZED:', num_return_sequences=1)
    first_completion = completions[0]
    print(first_completion['generated_text'])

In [None]:
# Try the fine-tuned generator on a sample Thai string.
romanize(finetuned_generator,'บ้านหนองเลา')