In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, GPT2Config
from datasets import load_dataset
from tokenizer import OptimalGPT2Tokenizer

In [2]:
# Load tokenizer and model
tokenizer = OptimalGPT2Tokenizer.from_pretrained("gpt2")

# Define a new model from scratch
config = GPT2Config()
model = GPT2LMHeadModel(config)

# Ensure tokenizer has a pad token
tokenizer.pad_token = tokenizer.eos_token

# Load dataset from a text file
dataset = load_dataset("text", data_files={"train": "data/paul_graham_essay.txt"})

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # No masked language modeling (MLM) for GPT-2
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'OptimalGPT2Tokenizer'.


Len of Vocab Initialized is 50257


Map:   0%|          | 0/351 [00:00<?, ? examples/s]

In [3]:
tokenizer.bpe("policymaker")

'policy maker'

In [4]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_model_optimal",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    num_train_epochs=12,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=False,
    learning_rate=3e-4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
train_result = trainer.train()

# Save final model
model.save_pretrained("./gpt2_model_optimal")
tokenizer.save_pretrained("./gpt2_model_optimal")

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,9.2443
20,7.3688
30,6.9984
40,6.811
50,6.4586
60,6.1804
70,6.2439
80,6.1557
90,6.1471
100,6.025


('./gpt2_model_optimal/tokenizer_config.json',
 './gpt2_model_optimal/special_tokens_map.json',
 './gpt2_model_optimal/vocab.json',
 './gpt2_model_optimal/merges.txt',
 './gpt2_model_optimal/added_tokens.json')

In [20]:
# Get the current optimizer
optimizer = trainer.optimizer

# Update the learning rate
for param_group in optimizer.param_groups:
    param_group["lr"] = 1e-4  # Set new learning rate

# Increase the number of epochs by 1
trainer.args.num_train_epochs = 3

# Train the model
train_result = trainer.train()

# Save final model
model.save_pretrained("./gpt2_model_optimal")
tokenizer.save_pretrained("./gpt2_model_optimal")

Step,Training Loss
10,1.5661
20,2.0228
30,1.9324
40,1.893
50,1.8234
60,1.7683
70,1.6917
80,1.5501
90,1.5988
100,1.2253


('./gpt2_model_optimal/tokenizer_config.json',
 './gpt2_model_optimal/special_tokens_map.json',
 './gpt2_model_optimal/vocab.json',
 './gpt2_model_optimal/merges.txt',
 './gpt2_model_optimal/added_tokens.json')

In [24]:
import torch
from transformers import GenerationConfig
def generate_text(prefix, max_length=25):
    inputs = tokenizer(prefix, return_tensors="pt")
    inputs.to("cuda:0")
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id, generation_config=GenerationConfig(
            num_beams=3, do_sample=False
        ))
    return tokenizer.decode(output[0].to("cpu"), skip_special_tokens=True)

# Example usage
prefix = "Before college the two main things"
generated_text = generate_text(prefix)
print(generated_text)


Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays.
