### Load and Sample the Dataset

In [1]:
from datasets import load_dataset

# Load the full TinyStories training split
dataset = load_dataset("roneneldan/TinyStories", split="train")

# Fraction of the training split kept for fine-tuning (0.1 = 10%, not 1%)
SAMPLE_FRACTION = 0.1
SAMPLE_SEED = 42

# train_test_split shuffles with a fixed seed; its "test" part is used here
# purely as a reproducible 10% random sample of the stories
small_dataset = dataset.train_test_split(test_size=SAMPLE_FRACTION, seed=SAMPLE_SEED)["test"]

print(small_dataset)


Dataset({
    features: ['text'],
    num_rows: 211972
})


### Initialize Tokenizer

In [2]:
from transformers import GPT2Tokenizer

# Load the pretrained tokenizer published under the 'gpt2' checkpoint name
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Padding is required to batch sequences of different lengths, so the EOS token doubles as the pad token
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a padding token, use EOS instead

### Tokenize Data

In [None]:
from transformers import DataCollatorForLanguageModeling

def tokenize_function(element):
    """Tokenize a batch of stories for causal language modelling.

    Args:
        element: A batch from ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed straight to the tokenizer.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
        Sequences are truncated to 512 tokens but deliberately left
        unpadded: padding every story to 512 tokens up front wastes memory
        and compute, so padding is deferred to the data collator, which pads
        each batch only to its own longest sequence.
    """
    return tokenizer(element["text"], truncation=True, max_length=512)

# Tokenize the sampled stories in batches. The original columns are dropped so
# only the tokenizer outputs remain; the column list is taken from the dataset
# actually being mapped rather than from the unrelated full `dataset` object.
tokenized_dataset = small_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=small_dataset.column_names,
)

# Collator for causal language modelling (mlm=False): it builds the labels from
# the input ids and pads a batch to a common length whenever its examples differ
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


Map:   0%|          | 0/211972 [00:00<?, ? examples/s]

### Fine-tune the Model and Save Checkpoints

In [4]:
import torch
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

# Load the pretrained GPT-2 weights that will be fine-tuned
model = GPT2LMHeadModel.from_pretrained('gpt2')

training_args = TrainingArguments(
    output_dir='./checkpoints',
    num_train_epochs=3,
    learning_rate=5e-4,
    # 8 examples per step x 8 accumulated steps = 64 examples per optimizer update (per device)
    per_device_train_batch_size=8,
    gradient_accumulation_steps=8,
    warmup_steps=200,
    weight_decay=0.01,
    logging_first_step=True,

    # Log the training loss every 100 steps to TensorBoard
    logging_dir="./checkpoints/logs",
    logging_steps=100,
    logging_strategy="steps",

    report_to="tensorboard",

    # Write a checkpoint every 100 steps, keeping at most 2 on disk
    save_steps=100,
    save_total_limit=2,

    seed=42,
    # Mixed precision needs a CUDA device; requesting fp16 on a CPU-only machine
    # makes TrainingArguments raise, so enable it only when a GPU is present
    fp16=torch.cuda.is_available(),
)

# Trainer wires together the model, hyperparameters, tokenized data and collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the final model and tokenizer side by side so inference can load both from one directory
model.save_pretrained("./checkpoints")
tokenizer.save_pretrained("./checkpoints")

Step,Training Loss
1,2.5913
100,2.0449
200,1.8164
300,1.7383
400,1.6752
500,1.644
600,1.6253
700,1.6035
800,1.5829
900,1.5588


('./checkpoints\\tokenizer_config.json',
 './checkpoints\\special_tokens_map.json',
 './checkpoints\\vocab.json',
 './checkpoints\\merges.txt',
 './checkpoints\\added_tokens.json')

### Run Inference

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

def generate_text(prompt, max_length=512, num_return_sequences=1, temperature=0.7, top_k=50, top_p=0.95, repetition_penalty=1.2, no_repeat_ngram_size=3, device='cpu', model_dir="./checkpoints"):
    """Sample one or more continuations of ``prompt`` from a saved GPT-2 model.

    The model and tokenizer are loaded from ``model_dir`` on every call.

    Args:
        prompt: Text the generated output starts from.
        max_length: Passed to ``model.generate`` as ``max_length``.
        num_return_sequences: How many samples to generate for the prompt.
        temperature: Sampling temperature.
        top_k: Top-k sampling cutoff.
        top_p: Nucleus (top-p) sampling cutoff.
        repetition_penalty: Penalty applied to already generated tokens.
        no_repeat_ngram_size: Size of n-grams that may not be repeated.
        device: Device the model and the encoded prompt are moved to.
        model_dir: Directory holding the saved model and tokenizer; defaults
            to the directory the training cell saves into.

    Returns:
        A list of ``num_return_sequences`` decoded strings with special
        tokens stripped.
    """
    # Load the trained model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_dir).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token

    # Inference mode: disables dropout
    model.eval()

    # Tokenize the prompt and move the encoded tensors to the target device
    # (named so that it does not shadow the builtin `input`)
    encoded_prompt = tokenizer(prompt, return_tensors='pt').to(device)

    # Generate text with added controls to avoid repetition
    generated_ids = model.generate(
        **encoded_prompt,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        do_sample=True,  # Enable sampling for more diverse outputs
        pad_token_id=tokenizer.pad_token_id,  # Explicitly set pad_token_id
        eos_token_id=tokenizer.eos_token_id,  # Stop generation at EOS token
        repetition_penalty=repetition_penalty,  # Apply repetition penalty
        no_repeat_ngram_size=no_repeat_ngram_size,  # Avoid repeating n-grams
    )

    # Decode and return generated text
    return [tokenizer.decode(seq, skip_special_tokens=True) for seq in generated_ids]

# Run on the GPU when one is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Story opening handed to the fine-tuned model
prompt = "Once upon a time there was a pumpkin."
generated_texts = generate_text(prompt, device=device)

# Print each sample under a 1-based heading
for i, text in enumerate(generated_texts):
    print(f"Generated Text {i+1}:\n{text}\n")


Generated Text 1:
Once upon a time there was a pumpkin. It was very big and round, but it was also very cold. One day the sun shone down on the pumpkin and made it shiver. Suddenly, the pumpkin started to shiver because of its cold. 

The farmer came outside and saw the pumpkin shivering. He knew he had to do something to make it warmer. He took out his warm mittens and started to knit a sweater for the pumpkin.  The farmer worked hard all day and soon enough he finished the sweater.  
 
When the mittens were finished, the farmer put them on the pumpkin. When he went inside, he told everyone that the pumpkin looked so much better now. Everyone was so happy and excited! From then on they always kept their eyes open when playing near the pumpkin! The end. 


The End. So, everyone went back to the warm and cozy pumpkin. They could still feel the warmth of the cold, but they knew that with love and care they would be able to keep on shivering and feeling better. And they all lived happily 