In [4]:
# ============================================
# TASK-01: GPT-2 Fine-tuning for Text Generation
# ============================================

# Install libraries
!pip install transformers datasets accelerate -q

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

print("TASK-01: GPT-2 Fine-tuning")
print("=" * 50)

# Download dataset
print("Downloading dataset...")
!wget -q https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open('input.txt', 'r') as f:
    text = f.read()

print(f"Dataset size: {len(text):,} characters")
print(f"Sample: {text[:100]}...")

# Prepare data
lines = text.split('\n')[:300]
print(f"Using {len(lines)} lines for training")

# Setup tokenizer and model
print("Loading model and tokenizer...")
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('gpt2').to('cuda') # Move model to GPU

# Tokenization
def tokenize_function(examples):
    tokenized_inputs = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=64
    )
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    return tokenized_inputs

dataset = Dataset.from_dict({'text': lines})
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset
split_data = tokenized_dataset.train_test_split(test_size=0.1)
train_data = split_data['train']
eval_data = split_data['test']

# Training setup
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=20,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    fp16=True,
    report_to="none",
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
)

print("Starting training...")
trainer.train()
print("Training completed!")

# Save model
trainer.save_model("gpt2_finetuned")

# Text generation function
def generate_text(prompt, max_length=80):
    inputs = tokenizer(prompt, return_tensors='pt').to('cuda') # Move input tensors to GPU
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        temperature=0.8,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Generate examples
print("\nGenerated Examples:")
print("=" * 50)

test_prompts = [
    "To be or not to be",
    "Romeo, wherefore art thou",
    "All the world's a stage",
    "Now is the winter",
    "Shall I compare thee"
]

for i, prompt in enumerate(test_prompts, 1):
    generated = generate_text(prompt)
    print(f"\n{i}. Prompt: {prompt}")
    print(f"   Generated: {generated}")

    with open(f"generated_{i}.txt", "w") as f:
        f.write(f"Prompt: {prompt}\nGenerated: {generated}\n")

print("\n" + "=" * 50)
print("Outputs saved: generated_1.txt to generated_5.txt")
print("Model saved: gpt2_finetuned/")
print("=" * 50)

# Download files
from google.colab import files
for i in range(1, 6):
    files.download(f"generated_{i}.txt")

TASK-01: GPT-2 Fine-tuning
Downloading dataset...
Dataset size: 1,115,394 characters
Sample: First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You...
Using 300 lines for training
Loading model and tokenizer...


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Starting training...


Step,Training Loss,Validation Loss
50,1.3698,0.660232
100,0.475,0.617214


Training completed!

Generated Examples:

1. Prompt: To be or not to be
   Generated: To be or not to be?

2. Prompt: Romeo, wherefore art thou
   Generated: Romeo, wherefore art thou,

3. Prompt: All the world's a stage
   Generated: All the world's a stage,

4. Prompt: Now is the winter
   Generated: Now is the winter, the spring

5. Prompt: Shall I compare thee
   Generated: Shall I compare thee well: the head, the heart, the heart

Outputs saved: generated_1.txt to generated_5.txt
Model saved: gpt2_finetuned/


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>