In [8]:
!pip install accelerate -U
!pip install transformers[torch]



In [21]:
import os, pickle, torch
from ipywidgets import Dropdown
import pandas as pd
import numpy as np
import re
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import torch.nn as nn

In [7]:
#get a list of the translations available for training.
trained_model_folder = "./out"
language_pair_staging_folder = "../data/magic_token_folder/"
use_side_frozen_tokens = True
if use_side_frozen_tokens:
    num_magic_tokens = 1
    trained_embeddings_pickle_file = os.path.join( language_pair_staging_folder, f"magic_tokens_size_{num_magic_tokens}.pickle" )

 - Primary models end with _model.
 - They are trained and then saved out as _model_step.
 - The embedding syncer averages together all the _model_step files and save them individually back as _model

In [3]:
if os.path.exists(trained_model_folder) and os.path.isdir(trained_model_folder):
    # Get a list of all subfolders in language_pair_staging_folder
    subfolders = [folder for folder in os.listdir(trained_model_folder) if os.path.isdir(os.path.join(trained_model_folder, folder))]

    # Filter subfolders that end with "_model"
    model_folders = [folder for folder in subfolders if folder.endswith("_model") or folder.endswith( "_model_step" ) ]

    # Print or use the list of model folders
    print("Folders ending with '_model' or '_step':", model_folders)
else:
    print(f"The folder '{trained_model_folder}' does not exist or is not a directory.")

Folders ending with '_model' or '_step': ['hebrew_model', 'target_model', 'hebrew_model_step', 'greek_model', 'bsb_model']


In [5]:
selected_model_dropdown = Dropdown(options=model_folders)
print( "Select which model to train" )
display(selected_model_dropdown)

Select which model to train


Dropdown(options=('hebrew_model', 'target_model', 'hebrew_model_step', 'greek_model', 'bsb_model'), value='heb…

In [29]:
#training code copied from
#https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners/notebook

def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, 
        mlm=mlm,
    )
    return data_collator


if use_side_frozen_tokens:
    learned_magic_tokens = {}
    if os.path.exists(trained_embeddings_pickle_file):
        with open(trained_embeddings_pickle_file, 'rb') as file:
            learned_magic_tokens = pickle.load(file)
            print( f"Loaded previouse training from {trained_embeddings_pickle_file}" )
    else:
        print( f"Previouse training not found at {trained_embeddings_pickle_file}" )
    
    class MagicTokensEmbeddingPatch(nn.Module):
        def __init__(self, tokenizer, other_embeddings ):
            super().__init__()
            self.tokenizer = tokenizer
            self.other_embeddings = other_embeddings
    
        def forward( self, x ):
            #print( f"Called forward with x {x}" )
            batch_result_list = []
            for batch in x:
                #print( f"batch is {batch}" )
                inputs_embeds_list = []
                for id in batch:
                    #print( f"id is {id}" )
                    token_string = self.tokenizer.decode( [id] )
                    #print( f"token_string is {token_string}" )
                    if token_string.startswith( '[' ) and token_string.endswith( ']' ):
                        if token_string in learned_magic_tokens:
                            inputs_embeds_list.append( learned_magic_tokens[token_string].detach() )
                        else:
                            #can't drop stuff because it makes the batches not be the same length.
                            #print( f"Dropped {token_string}" )
                            inputs_embeds_list.append( self.other_embeddings( torch.LongTensor(tokenizer.encode("_")).to(x.device) )[0] )
                    else:
                        inputs_embeds_list.append( self.other_embeddings( torch.LongTensor([id]).to(x.device) )[0] )
                        
                inputs_embeds = torch.stack(inputs_embeds_list, dim=0)
                batch_result_list.append(inputs_embeds)
            return torch.stack(batch_result_list, dim=0)

def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps,
          tokenizer=None):
    if tokenizer is None:
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)
    
    tokenizer.save_pretrained(output_dir)
      
    model = GPT2LMHeadModel.from_pretrained(model_name)
    
    model.save_pretrained(output_dir)

    if use_side_frozen_tokens:
        model.set_input_embeddings( MagicTokensEmbeddingPatch(tokenizer,model.get_input_embeddings()) )
    
    training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
          save_total_limit=4,
      )
    
    trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
    )
    
    # if only_train_magic_tokens:
    #     #freeze everything
    #     for param in model.parameters():
    #       param.requires_grad = False
        
    #     #Thaw the magic parameters.
    #     first_magic_token_ids = tokenizer.encode( "[GEN 1:1_a]" )
    #     assert (len(first_magic_token_ids) == 1)
    #     first_magic_token_id = first_magic_token_ids[0]

    #     #assume that the magic tokens are all rest of the tokens.
    #     for token_id in range(first_magic_token_id,len(tokenizer)):
    #         model.get_input_embeddings().weight[token_id].requires_grad = True
    #         model.get_output_embeddings().weight[token_id].requires_grad = True
        
      
    trainer.train()
    trainer.save_model()

Loaded previouse training from ../data/magic_token_folder/magic_tokens_size_1.pickle


In [12]:
selected_model_dropdown.value
target_model = selected_model_dropdown.value.replace( "_model", "" ).replace( "_step", "" )
target_model

'greek'

In [13]:
# you need to set parameters 
train_file_path = os.path.join( language_pair_staging_folder, f"train_{target_model}.txt" )
model_name = os.path.join( trained_model_folder, selected_model_dropdown.value )
output_dir = os.path.join( trained_model_folder, f"{target_model}_model_step" )
overwrite_output_dir = True
per_device_train_batch_size = 2 #8
num_train_epochs = 5.0
save_steps = 500

In [14]:
model = GPT2LMHeadModel.from_pretrained(model_name)

In [15]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [9]:
output_dir

'./out/hebrew_model_step'

In [30]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
    tokenizer=tokenizer,
)



Step,Training Loss


KeyboardInterrupt: 

In [48]:
#Here we loop through and train all the non step models not just the selected one.
# Filter subfolders that end with "_model"
non_step_models = [folder for folder in subfolders if folder.endswith("_model") ]

for model_selection in non_step_models:
    target_model = model_selection.replace( "_model", "" ).replace( "_step", "" )
    train_file_path = os.path.join( language_pair_staging_folder, f"train_{target_model}.txt" )
    model_name = os.path.join( trained_model_folder, selected_model_dropdown.value )
    output_dir = os.path.join( trained_model_folder, f"{target_model}_model_step" )
    print( f"Training model {model_selection} to {output_dir}" )
    train(
        train_file_path=train_file_path,
        model_name=model_name,
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps
    )

Training model greek_model to ./out/greek_model_step




Step,Training Loss
500,2.1375
1000,1.651
1500,1.5463
2000,1.4662
2500,1.4344
3000,1.3783
3500,1.3577
4000,1.3261
4500,1.3123
5000,1.2583


Training model bsb_model to ./out/bsb_model_step




Step,Training Loss
500,3.295
1000,2.4054
1500,2.2816
2000,2.3009
2500,2.2478
3000,2.236
3500,2.2399
4000,2.0888
4500,1.8538
5000,1.8726


Training model target_model to ./out/target_model_step




Step,Training Loss
500,5.1183
1000,4.2139
1500,4.0064
2000,3.785
2500,3.7226
3000,3.6669
3500,3.5407
4000,3.5131
4500,3.4802
5000,3.4159


Training model hebrew_model to ./out/hebrew_model_step




Step,Training Loss
500,0.7838
1000,0.8051
1500,0.8093
2000,0.8248
2500,0.8218
3000,0.8171
3500,0.8162
4000,0.8128
4500,0.8232
5000,0.8171
