In [17]:
from transformers import Trainer
from datasets import load_dataset
import torch

ds = load_dataset("Gustavosta/Stable-Diffusion-Prompts")

In [18]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [19]:
ds

DatasetDict({
    train: Dataset({
        features: ['Prompt'],
        num_rows: 73718
    })
    test: Dataset({
        features: ['Prompt'],
        num_rows: 8192
    })
})

In [20]:
def tokenize_function(examples):
    # Tokenize both the 'short_prompt' and 'long_prompt' columns
    #short_prompt_tokens = tokenizer(examples["short_prompt"], padding="max_length", truncation=True)
    prompt_tokens = tokenizer(examples["Prompt"], padding="max_length", truncation=True)
    
    # Merge both tokenized outputs into a single dictionary (or add them separately)
    return {
        'input_ids': prompt_tokens['input_ids'],
        'attention_mask': prompt_tokens['attention_mask']
    }

# Tokenize the entire dataset (train and test)
tokenized_datasets = ds.map(tokenize_function, batched=True)


Map:   0%|          | 0/8192 [00:00<?, ? examples/s]

In [21]:
def preprocess_data(examples):
    model_inputs = {
        'input_ids': examples['input_ids'],
        'attention_mask': examples['attention_mask'],
        'labels': examples['input_ids']  # The input_ids are the target labels
    }
    return model_inputs

# Apply the preprocessing to the train and test datasets
tokenized_datasets = tokenized_datasets.map(preprocess_data, batched=True)

Map:   0%|          | 0/8192 [00:00<?, ? examples/s]

In [22]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['Prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 73718
    })
    test: Dataset({
        features: ['Prompt', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8192
    })
})

In [23]:
tokenized_datasets=tokenized_datasets.remove_columns('Prompt')

In [24]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt_results",              # Save the model here
    evaluation_strategy="epoch",         # Evaluate after each epoch
    learning_rate=2e-5,                  # Learning rate
    per_device_train_batch_size=8,       # Training batch size
    per_device_eval_batch_size=8,        # Evaluation batch size
    num_train_epochs=2,                  # Number of epochs
    weight_decay=0.01,                   # Regularization
    save_steps=10000,
    logging_steps=5000                   # Log every 500 steps
)




In [25]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
) 


In [26]:
trainer.train() 

  0%|          | 0/18430 [00:00<?, ?it/s]

{'loss': 0.1643, 'grad_norm': 0.37413522601127625, 'learning_rate': 1.4574064026044494e-05, 'epoch': 0.54}


  0%|          | 0/1024 [00:00<?, ?it/s]

{'eval_loss': 0.11842742562294006, 'eval_runtime': 192.9537, 'eval_samples_per_second': 42.456, 'eval_steps_per_second': 5.307, 'epoch': 1.0}
{'loss': 0.1301, 'grad_norm': 0.41605913639068604, 'learning_rate': 9.148128052088985e-06, 'epoch': 1.09}
{'loss': 0.1199, 'grad_norm': 0.277890682220459, 'learning_rate': 3.722192078133479e-06, 'epoch': 1.63}


  0%|          | 0/1024 [00:00<?, ?it/s]

{'eval_loss': 0.1103677898645401, 'eval_runtime': 192.9685, 'eval_samples_per_second': 42.453, 'eval_steps_per_second': 5.307, 'epoch': 2.0}
{'train_runtime': 11616.939, 'train_samples_per_second': 12.691, 'train_steps_per_second': 1.586, 'train_loss': 0.1339884666405614, 'epoch': 2.0}


TrainOutput(global_step=18430, training_loss=0.1339884666405614, metrics={'train_runtime': 11616.939, 'train_samples_per_second': 12.691, 'train_steps_per_second': 1.586, 'total_flos': 7.7047704059904e+16, 'train_loss': 0.1339884666405614, 'epoch': 2.0})

In [27]:
trainer.evaluate() 

  0%|          | 0/1024 [00:00<?, ?it/s]

{'eval_loss': 0.1103677898645401,
 'eval_runtime': 192.3492,
 'eval_samples_per_second': 42.589,
 'eval_steps_per_second': 5.324,
 'epoch': 2.0}

In [28]:
model.save_pretrained("./fine_tuned_gpt_new_data")
tokenizer.save_pretrained("./fine_tuned_gpt_new_data")


('./fine_tuned_gpt_new_data/tokenizer_config.json',
 './fine_tuned_gpt_new_data/special_tokens_map.json',
 './fine_tuned_gpt_new_data/vocab.json',
 './fine_tuned_gpt_new_data/merges.txt',
 './fine_tuned_gpt_new_data/added_tokens.json',
 './fine_tuned_gpt_new_data/tokenizer.json')

: 