<a href="https://colab.research.google.com/github/preekshitsaklani/gpt2-textgen-finetune/blob/main/Text_Generation_using_GPT_2_(Fine_Tuning_it).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GPT-2 Fine-tuning with Hugging Face Transformers & PyTorch

## Using WikiText-2 dataset from Kaggle (downloaded via kagglehub)

In [None]:
!pip install -q transformers datasets kagglehub nltk torch torchvision torchaudio
!pip install -q numpy==1.26.4

## Libraries

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

import nltk
import torch
from datasets import Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
import kagglehub
import gc
import warnings
warnings.filterwarnings('ignore')

# Report the installed PyTorch version and whether a CUDA device is usable.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Device index 0 is the one whose name is printed.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Helper Function to Find File

In [None]:
def find_file(root, filename):
    """Return the path of the first file named ``filename`` found under ``root``.

    The tree is traversed with ``os.walk``; the result is the directory of the
    first match joined with ``filename``.

    Raises:
        FileNotFoundError: if no directory under ``root`` lists ``filename``.
    """
    candidates = (
        os.path.join(folder, filename)
        for folder, _subdirs, names in os.walk(root)
        if filename in names
    )
    located = next(candidates, None)
    if located is None:
        raise FileNotFoundError(f"{filename} not found in {root}")
    return located

## Locate Dataset Files

In [None]:
# Resolve the three WikiText split files by searching the tree under `path`.
# NOTE(review): no earlier cell visible in this notebook assigns `path`;
# presumably it should be the directory returned by a kagglehub dataset
# download — confirm and add that step before this cell, otherwise a fresh
# kernel raises NameError here.
train_file = find_file(path, "wiki.train.tokens")
test_file = find_file(path, "wiki.test.tokens")
valid_file = find_file(path, "wiki.valid.tokens")

## Print File Paths

In [None]:
# Echo the resolved paths so a wrong match is easy to spot.
print(f"Train file: {train_file}")
print(f"Test file: {test_file}")
print(f"Valid file: {valid_file}")

## Download NLTK Data

In [None]:
# Fetch NLTK's 'punkt' resource without progress output.
# NOTE(review): no later cell visible here calls into nltk — confirm this
# download is still needed.
nltk.download('punkt', quiet=True)

## Load Text Data

In [None]:
def load_text_file(filepath):
    """Read a UTF-8 text file and return its usable lines.

    The content is split on newlines and each piece is stripped of surrounding
    whitespace. Only pieces whose stripped length exceeds 10 characters are
    kept, in file order.

    Args:
        filepath: path of the text file to read.

    Returns:
        list[str]: the stripped lines longer than 10 characters.
    """
    kept = []
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle.read().split('\n'):
            text = raw_line.strip()
            # A length above 10 already implies the line is non-empty.
            if len(text) > 10:
                kept.append(text)
    return kept

## Load and Process Text Files

In [None]:
# Read every split into a list of cleaned lines via load_text_file.
train_texts = load_text_file(train_file)
test_texts = load_text_file(test_file)
valid_texts = load_text_file(valid_file)

# Line counts per split, before any subsampling.
print(f"Train samples: {len(train_texts)}")
print(f"Test samples: {len(test_texts)}")
print(f"Valid samples: {len(valid_texts)}")

## Limit Dataset Size (for faster experimentation)

In [None]:
# Keep only the first 5000 training lines and the first 500 validation lines.
# Both names are rebound, so the full lists are only recoverable by
# re-running the loading cell.
train_texts = train_texts[:5000]
valid_texts = valid_texts[:500]

print(f"Using {len(train_texts)} training samples and {len(valid_texts)} validation samples")

## Load Pre-trained GPT-2 Model and Tokenizer

In [None]:
# Load the tokenizer and the LM-head model from the same "gpt2" identifier.
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

## Set Padding Token

In [None]:
# Reuse the end-of-sequence token as the padding token; create_dataset pads
# every example to max_length and therefore needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token

## Move Model to Device and Print Info

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is also read later by load_fine_tuned_model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print(f"Model loaded on: {device}")
# Total element count across all parameter tensors, with thousands separators.
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

## Create Dataset Function

In [None]:
def create_dataset(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` into a fixed-length dataset for causal-LM training.

    Args:
        texts: list of strings, one example per entry.
        tokenizer: tokenizer callable with a pad token configured; it is
            called with ``truncation=True``, ``padding='max_length'`` and
            ``max_length``.
        max_length: number of tokens each example is truncated or padded to.

    Returns:
        A ``datasets.Dataset`` in which the ``text`` column has been replaced
        by the tokenizer outputs plus a ``labels`` column. ``labels`` mirrors
        ``input_ids`` but holds ``-100`` wherever ``attention_mask`` is 0, so
        padding never contributes to the loss even without a masking collator.
    """
    # Label value ignored by the language-model loss.
    ignore_index = -100

    def tokenize_function(examples):
        # No return_tensors: plain lists are sufficient for Dataset.map, which
        # avoids building tensors only to have them converted back.
        encoded = tokenizer(
            examples['text'],
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )
        features = dict(encoded)
        # Copy the ids as targets, masking the padded positions. The pad token
        # may share its id with a real token (e.g. EOS), so the attention mask
        # is used to identify padding, not the token id.
        features['labels'] = [
            [token if keep else ignore_index for token, keep in zip(ids, mask)]
            for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
        ]
        return features

    dataset = Dataset.from_dict({'text': texts})
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=['text']
    )

    return tokenized_dataset

## Tokenize Datasets

In [None]:
# Tokenize both splits with create_dataset's default max_length (128).
train_dataset = create_dataset(train_texts, tokenizer)
valid_dataset = create_dataset(valid_texts, tokenizer)

print(f"Tokenized train dataset: {len(train_dataset)}")
print(f"Tokenized valid dataset: {len(valid_dataset)}")

## Initialize Data Collator

In [None]:
# Batch collator for language modeling; mlm=False selects the causal
# (non-masked) objective. pad_to_multiple_of=8 requests padded lengths that
# are a multiple of 8.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

## Define Training Arguments

In [None]:
# Training configuration for the fine-tuning run.
# NOTE(review): `eval_strategy` is the keyword used by recent transformers
# releases; older releases may expect `evaluation_strategy` — confirm against
# the installed version.
training_args = TrainingArguments(
    output_dir="./gpt2-wikitext2",
    overwrite_output_dir=True,
    num_train_epochs=2,
    # 2 sequences per step x 4 accumulated steps = 8 sequences per optimizer
    # update on each device.
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Evaluate every 200 steps and checkpoint every 400, so each checkpoint
    # step is also an evaluation step.
    eval_strategy="steps",
    eval_steps=200,
    save_steps=400,
    save_total_limit=2,
    # Select the final model by eval_loss, where lower is better.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Request fp16 only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
    # No external experiment-tracking integrations.
    report_to="none",
)

## Initialize Trainer

In [None]:
# Bundle the model, the training configuration, both tokenized splits and the
# collator into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

## Start Training

In [None]:
# Run the fine-tuning loop configured by training_args.
print("Starting training...")
trainer.train()

## Save Fine-tuned Model and Tokenizer

In [None]:
# Write the model and the tokenizer to the same directory; that directory is
# the default model_path of load_fine_tuned_model.
print("Saving fine-tuned model...")
trainer.save_model("./fine-tuned-gpt2")
tokenizer.save_pretrained("./fine-tuned-gpt2")
print("Model saved successfully!")

## Text Generation and Model Loading Functions

In [None]:
def generate_text(prompt, model, tokenizer, max_length=100, temperature=0.8, top_p=0.9, top_k=50):
    """Sample a continuation of ``prompt`` from ``model`` and return it as text.

    Args:
        prompt: text to condition on. An empty prompt is replaced by a single
            start token (BOS, or EOS when the tokenizer has no BOS).
        model: causal language model exposing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: tokenizer used for both encoding and decoding.
        max_length: value forwarded to ``generate`` as ``max_length``.
        temperature: sampling temperature forwarded to ``generate``.
        top_p: nucleus-sampling threshold forwarded to ``generate``.
        top_k: top-k cutoff forwarded to ``generate``.

    Returns:
        str: the first generated sequence decoded with special tokens removed.
    """
    model.eval()

    # Call the tokenizer itself (not .encode) so the attention mask is
    # available; with pad == eos, generate() cannot infer it reliably.
    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    if input_ids.shape[-1] == 0:
        # Nothing to condition on: start from one start-of-text token so
        # generate() receives a non-empty sequence.
        start_id = tokenizer.bos_token_id
        if start_id is None:
            start_id = tokenizer.eos_token_id
        input_ids = torch.tensor([[start_id]], dtype=torch.long)
        attention_mask = torch.ones_like(input_ids)

    input_ids = input_ids.to(model.device)
    attention_mask = attention_mask.to(model.device)

    # Sampling only; gradients are not needed.
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            do_sample=True,
            top_k=top_k,
            top_p=top_p,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            no_repeat_ngram_size=2
        )

    # Only the first returned sequence is decoded.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

def load_fine_tuned_model(model_path="./fine-tuned-gpt2"):
    """Restore a saved tokenizer and model from ``model_path`` for inference.

    The model is moved to the notebook-level ``device`` before being returned.

    Args:
        model_path: directory containing the saved model and tokenizer files.

    Returns:
        tuple: ``(model, tokenizer)`` — the model first, then the tokenizer.
    """
    restored_tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    restored_model = GPT2LMHeadModel.from_pretrained(model_path)
    # `device` is the global chosen when the base model was loaded.
    restored_model.to(device)
    return restored_model, restored_tokenizer

## Test Text Generation

In [None]:
# Smoke-test generation using the in-memory `model` and `tokenizer` (the saved
# copy on disk is not reloaded here).
print("Testing text generation...")

# Testing
test_prompts = [
    "Artificial Intelligence is",
    "The future of technology will",
    "Machine learning algorithms can",
    "In the world of science,",
    "The most important discovery in"
]

# Generating answer for each prompt
for prompt in test_prompts:
    print(f"\nPrompt: '{prompt}'")
    print("=" * 50)
    # max_length=80 overrides generate_text's default of 100.
    generated = generate_text(prompt, model, tokenizer, max_length=80)
    print(generated)
    print("=" * 50)

## Interactive Text Generation Function

In [None]:
def interactive_generation():
    """Prompt the user in a loop and print a sampled continuation each time.

    Reads prompts with ``input()`` until the user types ``quit`` (matched
    case-insensitively, ignoring surrounding whitespace). Blank input is
    rejected with a message instead of being sent to the model. Generation
    uses the notebook-level ``model`` and ``tokenizer``.
    """
    print("Interactive Text Generation (type 'quit' to exit)")
    print("=" * 50)

    while True:
        # Strip so that inputs like " quit " or a trailing space still match.
        prompt = input("\nEnter your prompt: ").strip()
        if prompt.lower() == 'quit':
            break
        if not prompt:
            # An empty prompt gives the model nothing to condition on.
            print("Please enter a non-empty prompt.")
            continue

        print(f"\nGenerating text for: '{prompt}'")
        print("-" * 40)

        generated = generate_text(
            prompt,
            model,
            tokenizer,
            max_length=120,
            temperature=0.7,
            top_p=0.9,
            top_k=40
        )

        print(generated)

# Uncomment the next line to run interactive generation
# interactive_generation()

## Evaluate Model Function

In [None]:
def evaluate_model(model, tokenizer, test_dataset, data_collator):
    """Run an evaluation pass over ``test_dataset`` and print every metric.

    Args:
        model: model to evaluate.
        tokenizer: accepted as part of the signature; not used by this function.
        test_dataset: tokenized dataset passed to the Trainer as eval_dataset.
        data_collator: collator used to build evaluation batches.

    Returns:
        The metrics mapping returned by ``Trainer.evaluate()``.
    """
    print("Evaluating model...")

    # A dedicated Trainer with default arguments is built just for evaluation.
    eval_results = Trainer(
        model=model,
        eval_dataset=test_dataset,
        data_collator=data_collator,
    ).evaluate()

    print(f"Evaluation results:")
    # Every metric value is formatted with four decimal places.
    for key, value in eval_results.items():
        print(f"  {key}: {value:.4f}")

    return eval_results

## Create Test Dataset

In [None]:
# Tokenize only the first 500 test lines.
test_dataset = create_dataset(test_texts[:500], tokenizer)

## Evaluate Model

In [None]:
# Evaluate the in-memory model on the tokenized test subset.
evaluation_results = evaluate_model(model, tokenizer, test_dataset, data_collator)

## Clean Up and Confirm Completion

In [None]:
# Drop the notebook's reference to the Trainer. pop() with a default keeps
# this cell re-runnable, whereas `del trainer` raises NameError the second time.
globals().pop("trainer", None)

# Collect garbage first so that memory freed by the collector can then be
# returned from PyTorch's CUDA cache; the reverse order leaves it cached.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Training completed successfully!")
print("Fine-tuned model saved to: ./fine-tuned-gpt2")
print("You can now use the model for text generation!")