In [1]:
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

# Define a custom dataset class for generating poems
class PoemDataset(Dataset):
    def __init__(self, poems, tokenizer, max_length=100):
        self.poems = poems
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.poems)

    def __getitem__(self, idx):
        poem = self.poems[idx]
        input_ids = self.tokenizer.encode(poem, max_length=self.max_length, truncation=True)
        return torch.tensor(input_ids, dtype=torch.long)

# Load poems from a .txt file
def load_poems_from_txt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        poems = file.read().split('\n\n')  # Assuming poems are separated by empty lines
    return poems

# Define path to the .txt file containing poems
txt_file_path = 'poem.txt'

# Load poems from the .txt file
poems = load_poems_from_txt(txt_file_path)

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define training parameters
num_epochs = 50
batch_size = 4
max_length = 200

# Create dataset and data loader
dataset = PoemDataset(poems, tokenizer, max_length=max_length)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set model to training mode

    for batch in data_loader:
        optimizer.zero_grad()

        input_ids = batch.to(model.device)
        outputs = model(input_ids=input_ids, labels=input_ids)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Save trained model checkpoint
torch.save(model.state_dict(), 'poem_generator_model.pth')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Epoch 1/50, Loss: 4.261861324310303
Epoch 2/50, Loss: 3.666614055633545
Epoch 3/50, Loss: 3.4191668033599854
Epoch 4/50, Loss: 1.6676442623138428
Epoch 5/50, Loss: 1.1539918184280396
Epoch 6/50, Loss: 1.0380927324295044
Epoch 7/50, Loss: 0.6716369986534119
Epoch 8/50, Loss: 0.506334662437439
Epoch 9/50, Loss: 0.4820655882358551
Epoch 10/50, Loss: 0.42329683899879456
Epoch 11/50, Loss: 0.433626264333725
Epoch 12/50, Loss: 0.2817703187465668
Epoch 13/50, Loss: 0.15117332339286804
Epoch 14/50, Loss: 0.1675453782081604
Epoch 15/50, Loss: 0.0936296284198761
Epoch 16/50, Loss: 0.12598316371440887
Epoch 17/50, Loss: 0.09780106693506241
Epoch 18/50, Loss: 0.04488404467701912
Epoch 19/50, Loss: 0.046569645404815674
Epoch 20/50, Loss: 0.0459696501493454
Epoch 21/50, Loss: 0.037099238485097885
Epoch 22/50, Loss: 0.020280836150050163
Epoch 23/50, Loss: 0.019192231819033623
Epoch 24/50, Loss: 0.014651709236204624
Epoch 25/50, Loss: 0.01557962130755186
Epoch 26/50, Loss: 0.022949546575546265
Epoch 2