!pip install ipykernel datasets pytorch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 accelerate torch==2.6.0+cu124 tensorflow tf-keras
!pip install -U transformers

In [None]:
import os

# Configure PyTorch's CUDA caching allocator through its environment variable.
# This cell runs before `torch` is imported so the setting is already in place
# when the allocator is first used.
os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict
import torch
import pandas as pd

In [None]:
# Load the pretrained GPT-2 tokenizer and causal-LM weights from one checkpoint.
checkpoint = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Register the padding token and the poem boundary markers used in the
# training text, then grow the embedding matrix to cover the new vocabulary.
special_tokens_dict = dict(
    pad_token='<|pad|>',
    bos_token='<|startofpoem|>',
    eos_token='<|endofpoem|>',
)
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = special_tokens_dict['pad_token']


In [None]:
import pandas as pd
from datasets import Dataset

# Load structured data (e.g., from CSV)
df = pd.read_csv("PoetryFoundationData.csv")

# Drop short poems or empty entries. A missing poem has a NaN length, which
# fails the comparison and is therefore dropped too. The explicit copy makes
# the filtered frame independent of the one returned by read_csv.
df = df[df["Poem"].str.len() > 50].copy()

# Replace the remaining missing values with empty strings so every text field
# can be treated as a string when the rows are formatted.
df = df.fillna("")

# Format input text clearly with special tokens
def format_row(row):
    """Render one poem record as a single training string.

    Args:
        row: Mapping-like record with string values under the keys
            'Title', 'Poet', 'Tags' and 'Poem'.

    Returns:
        The stripped fields laid out as a metadata header, a blank line and
        the poem body, enclosed by the start/end-of-poem marker lines.
    """
    title, poet, tags, poem = (
        row[key].strip() for key in ("Title", "Poet", "Tags", "Poem")
    )
    lines = [
        "<|startofpoem|>",
        f"Title: {title}",
        f"Poet: {poet}",
        f"Tags: {tags}",
        "",  # blank line separating the header from the poem body
        poem,
        "<|endofpoem|>",
    ]
    return "\n".join(lines)

# Format every row into its training string, then wrap the resulting column
# in a single-column Hugging Face Dataset.
df["text"] = df.apply(format_row, axis=1)
dataset = Dataset.from_dict(dict(text=df["text"].tolist()))

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of formatted poems into fixed-length model inputs.

    Args:
        examples: Batch mapping whose "text" entry holds the strings to encode.

    Returns:
        The output of the module-level ``tokenizer``, with every sequence
        padded or truncated to 768 tokens and an attention mask included.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 768,
        "return_attention_mask": True,
    }
    return tokenizer(examples["text"], **encode_options)

# Tokenize the whole dataset in batches and drop the raw "text" column so only
# the tokenizer outputs remain.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
)


In [None]:
from datasets import DatasetDict
from transformers import DataCollatorForLanguageModeling

# Positional 80/20 split: the first 80% of examples are used for training and
# the remaining tail for validation (no shuffling is applied here).
train_size = int(0.8 * len(tokenized_dataset))
train_indices = range(train_size)
valid_indices = range(train_size, len(tokenized_dataset))

train_dataset = tokenized_dataset.select(train_indices)
valid_dataset = tokenized_dataset.select(valid_indices)

dataset_dict = DatasetDict(train=train_dataset, validation=valid_dataset)

# mlm=False selects causal (next-token) language modelling instead of
# masked-token prediction.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments
import torch

# 1. Make sure the embedding matrix matches the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

# 2. Hyper-parameters, collected in one mapping and expanded into
#    TrainingArguments below.
# NOTE(review): no evaluation strategy or load_best_model_at_end is configured
# here, so metric_for_best_model / greater_is_better presumably have no effect
# as written - confirm against the TrainingArguments documentation.
training_config = {
    # Output and checkpointing
    "output_dir": './results',
    "overwrite_output_dir": True,
    "save_steps": 1000,
    "save_total_limit": 2,
    # Logging
    "logging_dir": './logs',
    "logging_steps": 100,
    # Schedule and batch sizing (4 per device x 2 accumulation steps = 8)
    "num_train_epochs": 10,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 2,
    # Optimisation
    "learning_rate": 3e-5,
    "warmup_steps": 200,
    "weight_decay": 0.01,
    # Mixed precision only when a CUDA device is present
    "fp16": torch.cuda.is_available(),
    # Model selection: lower loss is better
    "metric_for_best_model": "loss",
    "greater_is_better": False,
}
training_args = TrainingArguments(**training_config)

# 3. Enable gradient checkpointing to save memory during training
model.gradient_checkpointing_enable()

# 4. Optional: Compute perplexity for evaluation (better than empty function)
import torch.nn.functional as F
import math

def compute_metrics(eval_pred):
    """Compute next-token cross-entropy loss and perplexity for an eval pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            logits with the vocabulary on the last axis and the sequence on
            the second-to-last axis; ``labels`` holds the target token ids,
            with -100 marking positions to ignore. Both may be NumPy arrays
            (which is what ``Trainer`` passes) or torch tensors. A tuple or
            list of prediction arrays is also accepted, in which case the
            first element is taken as the logits.

    Returns:
        dict with ``"loss"`` (mean cross-entropy over the non-ignored
        positions) and ``"perplexity"`` (``exp(loss)``, or ``inf`` when the
        exponential overflows).
    """
    logits, labels = eval_pred
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    # Trainer hands over NumPy arrays, which lack the tensor methods used
    # below; as_tensor converts them and leaves tensors unchanged.
    logits = torch.as_tensor(logits)
    labels = torch.as_tensor(labels)

    # Shift so that the logits at position t are scored against token t + 1.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    vocab_size = shift_logits.size(-1)
    loss = F.cross_entropy(
        shift_logits.reshape(-1, vocab_size).float(),
        shift_labels.reshape(-1).long(),
        ignore_index=-100,
    ).item()

    try:
        perplexity = math.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return {"loss": loss, "perplexity": perplexity}

# 5. Assemble the Trainer from the model, arguments, datasets and metric hook
trainer_kwargs = dict(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# 6. Run the fine-tuning loop
trainer.train()

# 7. Persist the fine-tuned weights and the tokenizer side by side so the
#    directory can be reloaded with from_pretrained
final_model_dir = "./fine_tuned_poetry_model_v3"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)

In [None]:
import os
import csv
import torch
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define scoring function
def score_poem(poem_text, model, tokenizer):
    """Score a poem by the model's own language-modelling loss.

    The text is wrapped in the start/end-of-poem marker lines, tokenized into
    PyTorch tensors and passed through ``model`` with the input ids reused as
    labels, without tracking gradients.

    Args:
        poem_text: Poem text to score (without the marker lines).
        model: Callable language model whose output exposes ``.loss``.
        tokenizer: Callable returning "input_ids" and "attention_mask".

    Returns:
        dict with "loss" (the model's loss as a float) and "perplexity"
        (the exponential of that loss as a float).
    """
    wrapped_text = f"<|startofpoem|>\n{poem_text}\n<|endofpoem|>"
    encoded = tokenizer(wrapped_text, return_tensors="pt")
    token_ids = encoded["input_ids"]
    mask = encoded["attention_mask"]

    with torch.no_grad():
        lm_loss = model(token_ids, attention_mask=mask, labels=token_ids).loss
        lm_perplexity = torch.exp(lm_loss)

    return dict(loss=lm_loss.item(), perplexity=lm_perplexity.item())

# Reload the fine-tuned tokenizer and weights from the training output folder
model_path = './fine_tuned_poetry_model_v3'
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Register the same pad / start-of-poem / end-of-poem tokens used in training
# and keep the embedding matrix sized to the tokenizer's vocabulary
special_tokens_dict = dict(
    pad_token='<|pad|>',
    bos_token='<|startofpoem|>',
    eos_token='<|endofpoem|>',
)
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))
tokenizer.pad_token = special_tokens_dict['pad_token']

# Generated poems are stored in a sub-folder of the model directory
output_dir = os.path.join(model_path, 'generated_poems')
os.makedirs(output_dir, exist_ok=True)

# Metadata used to condition generation
title = "Nature and Humanity"
poet = "myself"
tags = "Peace, Hope, Joy, Struggle, Victory, Unity, Diversity"

# The prompt uses the same header layout as the training text and ends after
# the blank line, so the model continues with the poem body.
prompt = "<|startofpoem|>\nTitle: {title}\nPoet: {poet}\nTags: {tags}\n\n".format(
    title=title,
    poet=poet,
    tags=tags,
)

inputs = tokenizer(prompt, return_tensors="pt")

# Sampling settings; generation stops at the end-of-poem token or once the
# sequence (prompt included) reaches max_length tokens.
sampling_options = {
    "max_length": 150,
    "temperature": 0.7,
    "top_p": 0.8,
    "top_k": 100,
    "repetition_penalty": 1.5,
    "do_sample": True,
    "num_return_sequences": 50,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.convert_tokens_to_ids("<|endofpoem|>"),
}

# Generate poems
outputs = model.generate(inputs['input_ids'], **sampling_options)

# Decode, score and export every generated poem, echoing each one to the console
csv_path = os.path.join(output_dir, 'generated_poems.csv')
average_perplexity = 0.0  # running sum here; divided by the poem count below
header_row = ['Poem Number', 'Title', 'Poet', 'Tags', 'Poem', 'Loss', 'Perplexity']

with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header_row)

    for poem_number, output in enumerate(outputs, start=1):
        poem = tokenizer.decode(output, skip_special_tokens=True).strip()
        score = score_poem(poem, model, tokenizer)
        average_perplexity += score['perplexity']

        loss_text = f"{score['loss']:.4f}"
        perplexity_text = f"{score['perplexity']:.4f}"
        # Newlines are written as a literal backslash-n so each poem stays
        # on one CSV line
        single_line_poem = poem.replace('\n', '\\n')

        writer.writerow([
            f"Poem #{poem_number}",
            title,
            poet,
            tags,
            single_line_poem,
            loss_text,
            perplexity_text,
        ])

        # Console echo of the poem and its score
        print(f"Poem #{poem_number}:\n{poem}\n")
        print(f"Score: Loss = {loss_text}, Perplexity = {perplexity_text}\n")
        print("=" * 100 + "\n")

# Turn the running sum into the mean over all generated poems
average_perplexity /= len(outputs)
print(f"Average Perplexity: {average_perplexity:.4f}")