In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2Config
from datasets import load_dataset
import matplotlib.pyplot as plt

In [9]:
# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define a new model from scratch
config = GPT2Config()
model = GPT2LMHeadModel(config)

# Ensure tokenizer has a pad token
tokenizer.pad_token = tokenizer.eos_token

# Load dataset from a text file
dataset = load_dataset("text", data_files={"train": "data/paul_graham_essay.txt"})

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # No masked language modeling (MLM) for GPT-2
)

In [41]:
tokenizer.bpe("policymaker")

'p olic ym aker'

In [3]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_model",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=False,
    learning_rate=1e-4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
train_result = trainer.train()

# Save final model
model.save_pretrained("./gpt2_model")
tokenizer.save_pretrained("./gpt2_model")

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,9.5589
20,8.5899
30,8.0278
40,7.4384
50,7.0965
60,6.4867
70,6.4377
80,6.2561
90,6.2191
100,6.1107


('./gpt2_model/tokenizer_config.json',
 './gpt2_model/special_tokens_map.json',
 './gpt2_model/vocab.json',
 './gpt2_model/merges.txt',
 './gpt2_model/added_tokens.json')

In [38]:
# Get the current optimizer
optimizer = trainer.optimizer

# Update the learning rate
for param_group in optimizer.param_groups:
    param_group["lr"] = 3e-4  # Set new learning rate

# Increase the number of epochs by 1
trainer.args.num_train_epochs = 3

# Continue training
train_result = trainer.train()

# Save updated model
model.save_pretrained("./gpt2_model")
tokenizer.save_pretrained("./gpt2_model")


Step,Training Loss
10,1.3787
20,2.0282
30,2.1213
40,2.0558
50,1.8234
60,1.5763
70,1.7366
80,1.6629
90,1.6993
100,1.3455


('./gpt2_model/tokenizer_config.json',
 './gpt2_model/special_tokens_map.json',
 './gpt2_model/vocab.json',
 './gpt2_model/merges.txt',
 './gpt2_model/added_tokens.json')

In [13]:
import torch
from transformers import GenerationConfig
def generate_text(prefix, max_length=25):
    inputs = tokenizer(prefix, return_tensors="pt")
    inputs.to("cuda:0")
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id, generation_config=GenerationConfig(
            num_beams=3, do_sample=False
        ))
    return tokenizer.decode(output[0].to("cpu"), skip_special_tokens=True)

# Example usage
prefix = "Before college the two main things"
generated_text = generate_text(prefix)
print(generated_text)


Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays.
