In [15]:
# Install the required libraries
!pip install torch
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [6]:
from transformers import pipeline

# Define the model name
model_name = "Chetan007/gpt2-sonnet-generators"

# Create a text generation pipeline
generator = pipeline('text-generation', model=model_name, device=0)  # Set device=0 for GPU, or device=-1 for CPU

# Generate a sonnet
sonnet = generator("Love is a vast sea", max_length=100, num_return_sequences=1)[0]['generated_text']

# Print the generated sonnet
print(sonnet)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Love is a vast sea of knowledge, and all too true is the desire to know more, to know more and more, to find more and more, to know more and more, to find more and to find more, that is, to know more and more, to find more and to find more, that is, to find more and to find more, and to find more and to find more, and to find more and to find more, and to find more and to find more,


In [8]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import csv

# Define the GPT-Neo model and tokenizer
model_name = "Chetan007/gpt2-sonnet-generators"
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define a custom dataset to load sonnets from a CSV file
class SonnetDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=128):
        self.sonnets = self.load_sonnets(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_sonnets(self, csv_file):
        sonnets = []
        with open(csv_file, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            for row in reader:
                sonnet = row[0].strip()  # Assuming sonnet is in the first column
                if sonnet:
                    sonnets.append(sonnet)
        return sonnets

    def __len__(self):
        return len(self.sonnets)

    def __getitem__(self, idx):
        sonnet = self.sonnets[idx]
        encoding = self.tokenizer(sonnet, return_tensors="pt", truncation=True, max_length=self.max_length, padding="max_length")
        return {key: encoding[key][0] for key in encoding}

# Set up your dataset and DataLoader
csv_file_path = "shakespeare_sonnets.csv"
sonnet_dataset = SonnetDataset(csv_file_path, tokenizer)
train_loader = DataLoader(sonnet_dataset, batch_size=4, shuffle=True)

# Set up training parameters
num_epochs = 1
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch in tqdm(train_loader, desc="Training"):
        inputs = {key: batch[key].to(device) for key in batch}
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Generate a sonnet
prompt = "Love is as fair as a flower"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.8)

# Decode and print the generated sonnet
generated_sonnet = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Sonnet:")
print(generated_sonnet)


Epoch 1/1


Training: 100%|██████████| 39/39 [00:01<00:00, 20.88it/s]
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Sonnet:
Love is as fair as a flower is fair, whereon thy fair face doth lie, that thou mayst take delight in, and in that which thou dost decease:   But wherefore do I not say, 'If thou dost not, I do not approve of my mistress' fair complexion'?  I grant, she is thy mistress, though I never saw her in my life.  And yet I dare not tell her my name, lest she should know that I


In [43]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import csv

# Define the GPT-Neo model and tokenizer
model_name = "Chetan007/gpt2-sonnet-generators"
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


# Set pad_token_id to eos_token_id
model.config.pad_token_id = tokenizer.eos_token_id

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define a custom dataset to load sonnets from a CSV file
class SonnetDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=128):
        self.sonnets = self.load_sonnets(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_sonnets(self, csv_file):
        sonnets = []
        with open(csv_file, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            for row in reader:
                sonnet = row[0].strip()  # Assuming sonnet is in the first column
                if sonnet:
                    sonnets.append(sonnet)
        return sonnets

    def __len__(self):
        return len(self.sonnets)

    def __getitem__(self, idx):
        sonnet = self.sonnets[idx]
        encoding = self.tokenizer(sonnet, return_tensors="pt", truncation=True, max_length=self.max_length, padding="max_length")
        return {key: encoding[key][0] for key in encoding}

# Set up your dataset and DataLoader
csv_file_path = "shakespeare_sonnets.csv"
sonnet_dataset = SonnetDataset(csv_file_path, tokenizer)
train_loader = DataLoader(sonnet_dataset, batch_size=8, shuffle=True)

# Set up training parameters
num_epochs = 10
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch in tqdm(train_loader, desc="Training"):
        inputs = {key: batch[key].to(device) for key in batch}
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    scheduler.step()
    
# Save the trained model
output_model_path = "finetuned_model.p"
model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)

Epoch 1/10


Training: 100%|██████████| 20/20 [00:01<00:00, 13.35it/s]


Epoch 2/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.30it/s]


Epoch 3/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


Epoch 4/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


Epoch 5/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


Epoch 6/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


Epoch 7/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


Epoch 8/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


Epoch 9/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


Epoch 10/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.31it/s]


('finetuned_model.p/tokenizer_config.json',
 'finetuned_model.p/special_tokens_map.json',
 'finetuned_model.p/vocab.json',
 'finetuned_model.p/merges.txt',
 'finetuned_model.p/added_tokens.json')

In [34]:
!pip install syllables

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting syllables
  Downloading syllables-1.0.9-py3-none-any.whl.metadata (2.4 kB)
Downloading syllables-1.0.9-py3-none-any.whl (15 kB)
Installing collected packages: syllables
Successfully installed syllables-1.0.9


In [38]:
# Generate a sonnet
prompt = "Love is a fair flower"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.2)

import syllables

def post_process_sonnet(generated_sonnet):
    # Split the text into lines based on syllable count (10 syllables per line)
    syllables_per_line = 10
    words = generated_sonnet.split()
    lines = [" ".join(words[i:i+syllables_per_line]) for i in range(0, len(words), syllables_per_line)]

    # Capitalize the first word of each line
    lines = [line.capitalize() for line in lines]

    # Ensure each line has exactly 10 syllables
    for i, line in enumerate(lines):
        current_syllables = sum(syllables.estimate(word) for word in line.split())
        if current_syllables > syllables_per_line:
            # Truncate words if the line has more than 10 syllables
            words_in_line = line.split()
            while current_syllables > syllables_per_line:
                # Remove the last word until the line has exactly 10 syllables
                last_word = words_in_line.pop()
                current_syllables -= syllables.estimate(last_word)
            lines[i] = " ".join(words_in_line)
        elif current_syllables < syllables_per_line:
            # Pad words if the line has fewer than 10 syllables
            lines[i] = line + " " + " ".join(["<PAD>" for _ in range(syllables_per_line - current_syllables)])

    # Filter out unwanted characters and remove empty lines
    allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',.?! ")
    lines = ["".join(c for c in line if c in allowed_characters) for line in lines]
    lines = [line for line in lines if line]  # Remove empty lines

    # Join the lines to form the sonnet
    formatted_sonnet = "\n".join(lines)

    return formatted_sonnet

# Decode and print the generated sonnet
generated_sonnet = tokenizer.decode(output[0], skip_special_tokens=True)
formatted_sonnet = post_process_sonnet(generated_sonnet)
print("Formatted Sonnet:")
print(formatted_sonnet)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Formatted Sonnet:
Love is a fair flower, which in meadows
And sweetpied flowers in the brocade
Me, and sweet with you. o, what sweetest you,
You are, when i first laid my hand on
And all alone heaven's sweet pride
My newfangledies and put them in my
You have grown so fond of me then you must
Grow so PAD PAD PAD PAD PAD PAD PAD PAD


In [44]:
import syllables

# Set the model to evaluation mode
model.eval()

# Generate sonnet
generated_lines = []
for _ in range(14):  # Generate 14 lines
    prompt = "Love"  # You can change the prompt as needed
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    # Ensure attention_mask is set
    attention_mask = torch.ones(input_ids.shape, device=device)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,  # Pass attention_mask
        max_length=100,  # Adjust as needed
        num_beams=5,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=1.0,  # Set to 1.0 for greedy decoding
        temperature=1.0  # Set to 1.0 for greedy decoding
    )

    # Decode and format sonnet
    sonnet_text = tokenizer.decode(output[0], skip_special_tokens=True)
    sonnet_lines = sonnet_text.split("\n")

    # Filter out lines with syllable count other than 10
    sonnet_lines = [line.strip() for line in sonnet_lines if syllables.estimate(line) == 10]

    generated_lines.extend(sonnet_lines)

# Join the lines to form the sonnet
formatted_sonnet = "\n".join(generated_lines)

# Print and save the generated sonnet
print("Generated Sonnet:")
print(formatted_sonnet)

# Save the trained model
output_model_path = "finetuned_model_sonnet.p"
model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)




Generated Sonnet:



('finetuned_model_sonnet.p/tokenizer_config.json',
 'finetuned_model_sonnet.p/special_tokens_map.json',
 'finetuned_model_sonnet.p/vocab.json',
 'finetuned_model_sonnet.p/merges.txt',
 'finetuned_model_sonnet.p/added_tokens.json')

In [45]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GPT2Config
import torch

# Load GPT-Neo model and tokenizer
model_name = "Chetan007/gpt2-sonnet-generators"
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up a GPT2Config object for generation
generation_config = GPT2Config.from_pretrained(model_name)

# Generate sonnet
generated_lines = []
for _ in range(14):  # Generate 14 lines
    prompt = "Love"  # You can change the prompt as needed
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    
    # Ensure attention_mask is set
    attention_mask = torch.ones(input_ids.shape, device=device)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,  # Pass attention_mask
        max_length=100,  # Adjust as needed
        num_beams=5,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=1.0,  # Set to 1.0 for greedy decoding
        temperature=1.0,  # Set to 1.0 for greedy decoding
        config=generation_config  # Pass the GPT2Config for generation
    )

    # Decode and format sonnet
    sonnet_text = tokenizer.decode(output[0], skip_special_tokens=True)
    sonnet_lines = sonnet_text.split("\n")

    # Filter out lines with a syllable count other than 10
    sonnet_lines = [line.strip() for line in sonnet_lines if syllables.estimate(line) == 10]

    generated_lines.extend(sonnet_lines)


You are using a model of type gpt_neo to instantiate a model of type gpt2. This is not supported for all configurations of models and can yield errors.


ValueError: The following `model_kwargs` are not used by the model: ['config'] (note: typos in the generate arguments will also show up in this list)

In [50]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import csv

# Define the GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token


# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define a custom dataset to load sonnets from a CSV file
class SonnetDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=128):
        self.sonnets = self.load_sonnets(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def load_sonnets(self, csv_file):
        sonnets = []
        with open(csv_file, 'r', encoding='utf-8') as file:
            reader = csv.reader(file)
            for row in reader:
                sonnet = row[0].strip()  # Assuming sonnet is in the first column
                if sonnet:
                    sonnets.append(sonnet)
        return sonnets

    def __len__(self):
        return len(self.sonnets)

    def __getitem__(self, idx):
        sonnet = self.sonnets[idx]
        encoding = self.tokenizer(sonnet, return_tensors="pt", truncation=True, max_length=self.max_length, padding="max_length")
        return {key: encoding[key][0] for key in encoding}

# Set up your dataset and DataLoader
csv_file_path = "shakespeare_sonnets.csv"  # Adjust the path to your sonnet dataset
sonnet_dataset = SonnetDataset(csv_file_path, tokenizer)
train_loader = DataLoader(sonnet_dataset, batch_size=8, shuffle=True)

# Set up training parameters
num_epochs = 10
learning_rate = 5e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    for batch in tqdm(train_loader, desc="Training"):
        inputs = {key: batch[key].to(device) for key in batch}
        outputs = model(**inputs, labels=inputs["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    scheduler.step()

# Save the trained model
output_model_path = "finetuned_model"
model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_model_path)

Epoch 1/10


Training: 100%|██████████| 20/20 [00:01<00:00, 13.41it/s]


Epoch 2/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.41it/s]


Epoch 3/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.42it/s]


Epoch 4/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.43it/s]


Epoch 5/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.43it/s]


Epoch 6/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.43it/s]


Epoch 7/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.42it/s]


Epoch 8/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.43it/s]


Epoch 9/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.44it/s]


Epoch 10/10


Training: 100%|██████████| 20/20 [00:01<00:00, 14.43it/s]


('finetuned_model/tokenizer_config.json',
 'finetuned_model/special_tokens_map.json',
 'finetuned_model/vocab.json',
 'finetuned_model/merges.txt',
 'finetuned_model/added_tokens.json')

In [52]:
# Generate a sonnet using the fine-tuned model with TextGenerationConfig
config = GPT2Config.from_pretrained(output_model_path)
config.max_length = 14 * 10  # 14 lines with 10 syllables each
config.num_return_sequences = 1  # Number of sequences to generate
model = GPT2LMHeadModel.from_pretrained(output_model_path, config=config)
model.to(device)  # Move the model back to the device

# Generate a sonnet using the fine-tuned model
prompt = "Love is a fair flower"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
output = model.generate(input_ids, max_length=config.max_length, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.2)

import syllables

def post_process_sonnet(generated_sonnet):
    # Split the text into lines based on syllable count (10 syllables per line)
    syllables_per_line = 10
    words = generated_sonnet.split()
    lines = [" ".join(words[i:i+syllables_per_line]) for i in range(0, len(words), syllables_per_line)]

    # Capitalize the first word of each line
    lines = [line.capitalize() for line in lines]

    # Ensure each line has exactly 10 syllables
    for i, line in enumerate(lines):
        current_syllables = sum(syllables.estimate(word) for word in line.split())
        if current_syllables > syllables_per_line:
            # Truncate words if the line has more than 10 syllables
            words_in_line = line.split()
            while current_syllables > syllables_per_line:
                # Remove the last word until the line has exactly 10 syllables
                last_word = words_in_line.pop()
                current_syllables -= syllables.estimate(last_word)
            lines[i] = " ".join(words_in_line)
        elif current_syllables < syllables_per_line:
            # Pad words if the line has fewer than 10 syllables
            lines[i] = line + " " + " ".join(["<PAD>" for _ in range(syllables_per_line - current_syllables)])

    # Filter out unwanted characters and remove empty lines
    allowed_characters = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',.?! ")
    lines = ["".join(c for c in line if c in allowed_characters) for line in lines]
    lines = [line for line in lines if line]  # Remove empty lines

    # Join the lines to form the sonnet
    formatted_sonnet = "\n".join(lines)

    return formatted_sonnet

# Decode and print the generated sonnet
generated_sonnet = tokenizer.decode(output[0], skip_special_tokens=True)
formatted_sonnet = post_process_sonnet(generated_sonnet)
print("Formatted Sonnet:")
print(formatted_sonnet)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Formatted Sonnet:
Love is a fair flower, but not so sweet
Thy sweetest buds, which, when ripe, are
Is thy beauty so great as mine, nor
Nor my own worth so high as theirs but thou,
Thyself, my love'st self, thyself's
Of all fairies. o, if i may say so,
Love thee more than i do love thy self.' 'tis
True,' i say, 'tis true' PAD PAD PAD PAD PAD
