In [1]:
!pip install accelerate -U
!pip install transformers[torch]



In [2]:
import os
from ipywidgets import Dropdown
import pandas as pd
import numpy as np
import re
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [3]:
#get a list of the translations available for training.
trained_model_folder = "./out"
language_pair_staging_folder = "../data/magic_token_folder/"

In [21]:
if os.path.exists(trained_model_folder) and os.path.isdir(trained_model_folder):
    # Get a list of all subfolders in language_pair_staging_folder
    subfolders = [folder for folder in os.listdir(trained_model_folder) if os.path.isdir(os.path.join(trained_model_folder, folder))]

    # Filter subfolders that end with "_model"
    model_folders = [folder for folder in subfolders if folder.endswith("_model") or folder.endswith( "_model_step" ) ]

    # Print or use the list of model folders
    print("Folders ending with '_model' or '_step':", model_folders)
else:
    print(f"The folder '{trained_model_folder}' does not exist or is not a directory.")

Folders ending with '_model': ['macula_model', 'bsb_model_step', 'greek_model', 'greek_model_step', 'bsb_model', 'target_model', 'hebrew_model']


In [22]:
selected_model_dropdown = Dropdown(options=model_folders)
print( "Select which model to train" )
display(selected_model_dropdown)

Select which model to train


Dropdown(options=('macula_model', 'bsb_model_step', 'greek_model', 'greek_model_step', 'bsb_model', 'target_mo…

In [50]:
#training code copied from
#https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners/notebook

def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)
      
  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          save_total_limit=4,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )
      
  trainer.train()
  trainer.save_model()

In [51]:
selected_model_dropdown.value
target_model = selected_model_dropdown.value.replace( "_model", "" ).replace( "_step", "" )
target_model

'target'

In [52]:
# you need to set parameters 
train_file_path = os.path.join( language_pair_staging_folder, f"train_{target_model}.txt" )
model_name = os.path.join( trained_model_folder, selected_model_dropdown.value )
output_dir = os.path.join( trained_model_folder, f"{target_model}_model_step" )
overwrite_output_dir = True
per_device_train_batch_size = 2 #8
num_train_epochs = 5.0
save_steps = 500

In [53]:
output_dir

'./out/target_model_step'

In [54]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,5.1987
1000,4.0856
1500,3.9049
2000,3.6841
2500,3.6193
3000,3.5731
3500,3.4501
4000,3.4268
4500,3.3947
5000,3.3396
