In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling

# 1. Dataset Preparation
# Location of the training split and the maximum sequence length (in tokens)
# of each training example.
TRAIN_FILE = './Anthropic_HH_Golden/hh_golden/train.jsonl'
BLOCK_SIZE = 128

# Tokenizer matching the pretrained 'gpt2' checkpoint; it is reused by the
# dataset, the collator, and the generation pipeline.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# NOTE(review): TextDataset is handed a .jsonl file here; presumably the raw
# JSON text (keys, braces, quotes) is tokenized as-is -- confirm this is the
# intended training text.
dataset = TextDataset(tokenizer=tokenizer, file_path=TRAIN_FILE, block_size=BLOCK_SIZE)

# mlm=False: batches are built for causal language modeling, not masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 2. Model Architecture
# GPT-2 with its language-modeling head, initialized from the pretrained
# 'gpt2' checkpoint. This is the model object passed to the Trainer and, after
# training, to the text-generation pipeline.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# 3. Model Training
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration.
training_args = TrainingArguments(
    # Where checkpoints are written; an existing directory is overwritten.
    output_dir='./output',
    overwrite_output_dir=True,
    # Training schedule.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    # Checkpointing: save every 10,000 steps and cap the number kept at 2.
    save_steps=10_000,
    save_total_limit=2,
    # Evaluation reports the loss only.
    prediction_loss_only=True,
)

# Wire the model, the training data, and the collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

# Run the fine-tuning loop.
trainer.train()

# 4. Evaluation
import math

# Held-out split, chunked with the same block size used for training.
eval_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='./Anthropic_HH_Golden/hh_golden/test.jsonl',
    block_size=128
)

eval_results = trainer.evaluate(eval_dataset)

# "eval_loss" is the mean cross-entropy loss, not the perplexity itself.
# Perplexity is exp(loss), so the loss must be exponentiated before it is
# reported under that name.
eval_loss = eval_results["eval_loss"]
try:
    perplexity = math.exp(eval_loss)
except OverflowError:
    # A very large loss overflows a float; report it as infinite perplexity.
    perplexity = float("inf")

print(f'Eval loss: {eval_loss}')
print(f'Perplexity: {perplexity}')

# 5. Conditional Text Generation
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline.
text_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

prompt = "Once upon a time"

# Sampling setup: top-k / nucleus sampling, three continuations, each capped
# at a total length of 100 tokens.
sampling_options = dict(
    max_length=100,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=3,
)
output = text_generator(prompt, **sampling_options)

# Print generated text
for sample in output:
    print(sample['generated_text'])




dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/74655 [00:00<?, ?it/s]