In [14]:
from transformers import Trainer

In [15]:
from datasets import load_dataset

ds = load_dataset("gokaygokay/prompt-enhancer-dataset")

I think we only need to use the long prompt and train an unsupervised model. 

In [16]:
# Loading the GPT2 tokenizer and model

from transformers import AutoTokenizer, AutoModelForCausalLM
# AutoModelForCausalLM converts the embedding output of your model to a word. 
# If you use AutoModel, it doesn't convert to a word; only spits out the embedding.

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") 

In [17]:
tokenizer.pad_token = tokenizer.eos_token
# Apparently we need to add a 'pad token' to the tokenizer. IDK what that means though.

In [18]:
def tokenize_function(examples):
    # Tokenize both the 'short_prompt' and 'long_prompt' columns
    short_prompt_tokens = tokenizer(examples["short_prompt"], padding="max_length", truncation=True)
    long_prompt_tokens = tokenizer(examples["long_prompt"], padding="max_length", truncation=True)
    
    # Merge both tokenized outputs into a single dictionary (or add them separately)
    return {
        'short_prompt_input_ids': short_prompt_tokens['input_ids'],
        'short_prompt_attention_mask': short_prompt_tokens['attention_mask'],
        'long_prompt_input_ids': long_prompt_tokens['input_ids'],
        'long_prompt_attention_mask': long_prompt_tokens['attention_mask']
    }

# Tokenize the entire dataset (train and test)
tokenized_datasets = ds.map(tokenize_function, batched=True)


Map:   0%|          | 0/1790 [00:00<?, ? examples/s]

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['short_prompt', 'long_prompt', 'short_prompt_input_ids', 'short_prompt_attention_mask', 'long_prompt_input_ids', 'long_prompt_attention_mask'],
        num_rows: 16110
    })
    test: Dataset({
        features: ['short_prompt', 'long_prompt', 'short_prompt_input_ids', 'short_prompt_attention_mask', 'long_prompt_input_ids', 'long_prompt_attention_mask'],
        num_rows: 1790
    })
})

In [21]:
def preprocess_data(examples):
    # The model takes input_ids and attention_mask as inputs, and long_prompt_input_ids as labels
    model_inputs = {
        'input_ids': examples['short_prompt_input_ids'],
        'attention_mask': examples['short_prompt_attention_mask'],
        'labels': examples['long_prompt_input_ids']  # The long_prompt_input_ids are the target labels
    }
    return model_inputs

# Apply the preprocessing to the train and test datasets
tokenized_datasets = tokenized_datasets.map(preprocess_data, batched=True)

Map:   0%|          | 0/16110 [00:00<?, ? examples/s]

Map:   0%|          | 0/1790 [00:00<?, ? examples/s]

In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",              # Where to save the final model
    evaluation_strategy="epoch",         # Evaluate after every epoch
    learning_rate=2e-5,                  # Learning rate
    per_device_train_batch_size=8,       # Batch size for training
    per_device_eval_batch_size=8,        # Batch size for evaluation
    num_train_epochs=3,                  # Number of training epochs
    weight_decay=0.01,                   # Weight decay for regularization
    logging_dir="./logs",                # Directory for logs
    logging_steps=500,                   # Frequency of logging
    remove_unused_columns=False
)



In [27]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)


In [28]:
trainer.train()

  0%|          | 0/6042 [00:00<?, ?it/s]

TypeError: GPT2LMHeadModel.forward() got an unexpected keyword argument 'short_prompt_input_ids'

In [30]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['short_prompt', 'long_prompt', 'short_prompt_input_ids', 'short_prompt_attention_mask', 'long_prompt_input_ids', 'long_prompt_attention_mask', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16110
    })
    test: Dataset({
        features: ['short_prompt', 'long_prompt', 'short_prompt_input_ids', 'short_prompt_attention_mask', 'long_prompt_input_ids', 'long_prompt_attention_mask', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1790
    })
})

In [25]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['short_prompt', 'long_prompt', 'short_prompt_input_ids', 'short_prompt_attention_mask', 'long_prompt_input_ids', 'long_prompt_attention_mask', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 16110
    })
    test: Dataset({
        features: ['short_prompt', 'long_prompt', 'short_prompt_input_ids', 'short_prompt_attention_mask', 'long_prompt_input_ids', 'long_prompt_attention_mask', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1790
    })
})