In [None]:
!pip install -q transformers datasets torch tqdm matplotlib tiktoken evaluate nltk accelerate
!pip install -q huggingface_hub

In [4]:
import os
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_scheduler,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)

from datasets import load_dataset, Dataset as HFDataset
import evaluate

In [11]:
# Fix every random number generator used in this notebook to one shared seed
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if torch.cuda.is_available():
    # Seed all visible CUDA devices as well
    torch.cuda.manual_seed_all(SEED)

# Run on the GPU when one is present, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device: {}".format(device))

# 2. Load the BBC News Summary Dataset
# ------------------------------------

# Pull the article/summary pairs from the Hugging Face Hub
bbc_dataset = load_dataset("gopalkalpande/bbc-news-summary")
print(bbc_dataset)

# Preview the first record of the 'train' split, clipping long fields to 200 characters
sample = bbc_dataset['train'][0]
print(
    "\nSample entry:",
    f"File path: {sample['File_path']}",
    f"News (first 200 chars): {sample['Articles'][:200]}...",
    f"Summary (first 200 chars): {sample['Summaries'][:200]}...",
    sep="\n",
)

# 3. Prepare the Dataset for Instruction Fine-Tuning
# --------------------------------------------------

# Interchangeable phrasings of the task; one is drawn at random per example
instruction_templates = [
    "Summarize the following news article.",
    "Create a concise summary of this news piece.",
    "What are the key points from this article?",
    "Write a brief summary of the following news article.",
    "Generate a summary that captures the main points of this news article."
]

def format_instruction_example(example):
    """Render one article/summary record as a single instruction-style text.

    Args:
        example: Mapping with an 'Articles' entry (the news text) and a
            'Summaries' entry (the reference summary).

    Returns:
        A dict with one key, 'formatted_text', holding the preamble, a randomly
        chosen instruction, the article and the summary, separated by blank lines.
    """
    # One draw from the module-level `random` generator per example
    chosen_instruction = random.choice(instruction_templates)

    sections = (
        "Below is an instruction that describes a task.\n"
        "Write a response that appropriately completes the request.",
        "### Instruction:\n" + chosen_instruction,
        f"### Input:\n{example['Articles']}",
        f"### Response:\n{example['Summaries']}",
    )

    return {"formatted_text": "\n\n".join(sections)}

# Hold out 20% of the records for testing and keep the remaining 80% for training
train_testvalid = bbc_dataset['train'].train_test_split(test_size=0.2, seed=SEED)
train_dataset, test_dataset = train_testvalid['train'], train_testvalid['test']

# Preview the first record of the training split
sample = train_dataset[0]
print(
    "\nSample entry:",
    f"File path: {sample['File_path']}",
    f"News (first 200 chars): {sample['Articles'][:200]}...",
    f"Summary (first 200 chars): {sample['Summaries'][:200]}...",
    sep="\n",
)

# Add a 'formatted_text' column holding the instruction prompt for every record
# (training split first, then the test split)
train_formatted = train_dataset.map(format_instruction_example)
test_formatted = test_dataset.map(format_instruction_example)


Using device: cpu
DatasetDict({
    train: Dataset({
        features: ['File_path', 'Articles', 'Summaries'],
        num_rows: 2224
    })
})

Sample entry:
File path: politics
News (first 200 chars): Budget to set scene for election..Gordon Brown will seek to put the economy at the centre of Labour's bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to str...
Summary (first 200 chars): - Increase in the stamp duty threshold from £60,000 - A freeze on petrol duty - An extension of tax credit scheme for poorer families - Possible help for pensioners The stamp duty threshold rise is in...

Sample entry:
File path: entertainment
News (first 200 chars): Angels 'favourite funeral song'..Angels by Robbie Williams is the song Britons would most like played at their funeral, a survey has suggested...While the melancholy hit topped the UK poll, Europeans ...
Summary (first 200 chars): Queen's Who Wants to Live Forever was highly favoured by both UK an

Map:   0%|          | 0/1779 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

In [12]:

# Show the first 500 characters of the first formatted training example
print(
    "\nFormatted example:",
    train_formatted[0]['formatted_text'][:500] + "...",
    sep="\n",
)



Formatted example:
Below is an instruction that describes a task.
Write a response that appropriately completes the request.

### Instruction:
Summarize the following news article.

### Input:
Angels 'favourite funeral song'..Angels by Robbie Williams is the song Britons would most like played at their funeral, a survey has suggested...While the melancholy hit topped the UK poll, Europeans favoured Queen's more upbeat anthem The Show Must Go On as their first choice. Frank Sinatra's My Way was second in the UK vot...


In [13]:





# 4. Load Pre-trained Model and Tokenizer
# ---------------------------------------

# GPT-2 (small) is used here because it is manageable on limited resources;
# other checkpoints such as "gpt2-medium" or "EleutherAI/gpt-neo-125M" can be
# substituted by changing this name.
model_name = "gpt2"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForCausalLM.from_pretrained(model_name),
)

# When the tokenizer defines no padding token, reuse end-of-sequence for padding
# and record the matching id on the model config
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

print(
    f"Model loaded: {model_name}",
    f"Model parameters: {model.num_parameters():,}",
    sep="\n",
)

# 5. Tokenize the Dataset
# ----------------------

# Set maximum length for the tokenizer
max_length = 1024  # Adjust based on your GPU memory and article lengths

def tokenize_function(examples):
    """Tokenize a batch of formatted prompts and attach language-modeling labels.

    Args:
        examples: Batch mapping with a 'formatted_text' entry holding a list of
            strings (the shape `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output (input ids and attention mask, every row padded or
        truncated to `max_length`) with an added 'labels' entry. Labels equal
        the input ids on real tokens and -100 on padded positions.
    """
    # NOTE(review): truncation drops tokens past `max_length`; the summary sits
    # at the end of the text, so over-long examples presumably lose part of it
    # (assuming the tokenizer truncates on the right) -- confirm.
    tokenized = tokenizer(
        examples["formatted_text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    # The pad token may be the same id as EOS, so padding is identified through
    # the attention mask rather than by token id. -100 is the label value that
    # PyTorch's cross-entropy ignores by default.
    tokenized["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized

# Tokenize both splits in batches (training split first), dropping the original
# columns so only the tokenizer's outputs remain
train_tokenized, test_tokenized = (
    formatted.map(
        tokenize_function,
        batched=True,
        remove_columns=formatted.column_names,
    )
    for formatted in (train_formatted, test_formatted)
)

# Have the datasets hand back PyTorch tensors.
# NOTE: `train_dataset` / `test_dataset` held the raw text splits until now;
# from here on these names refer to the tokenized data.
train_dataset = train_tokenized.with_format("torch")
test_dataset = test_tokenized.with_format("torch")

print(
    f"Train dataset size: {len(train_dataset)}",
    f"Test dataset size: {len(test_dataset)}",
    sep="\n",
)



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded: gpt2
Model parameters: 124,439,808


Map:   0%|          | 0/1779 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Train dataset size: 1779
Test dataset size: 445


In [None]:
# 6. Set up Training Arguments and Trainer
# ---------------------------------------

# Hyper-parameters and bookkeeping options, grouped by purpose
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./results",
    overwrite_output_dir=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # Disable wandb, etc.
    # Optimisation
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    # Batch sizes (adjust to the available GPU memory)
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluation and checkpoint cadence
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=3,
    load_best_model_at_end=True,
)

# Collator for causal (not masked) language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Bundle the model, arguments, data and collator into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 7. Fine-tune the Model
# ---------------------

print("Starting fine-tuning...")
trainer.train()

# 8. Save the Fine-tuned Model
# ---------------------------

# Store the model weights and the tokenizer files side by side
model_save_path = "./bbc_news_summary_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")



Starting fine-tuning...


In [None]:
# 9. Evaluate the Model
# -------------------

# Create a function to generate summaries with our fine-tuned model
def generate_summary(news_article, model, tokenizer, max_new_tokens=150):
    """Generate a summary of one news article with beam search.

    The article is wrapped in the fixed "Summarize the following news article."
    instruction prompt. If the prompt plus `max_new_tokens` would not fit in the
    model's context window, the article (and only the article) is truncated so
    the trailing "### Response:" marker is always kept.

    Args:
        news_article: Article text to summarize.
        model: Causal language model; its `config`, `device` and `generate`
            attributes are used.
        tokenizer: Tokenizer matching `model`; its `encode`, `decode`,
            `eos_token_id` and `model_max_length` attributes are used.
        max_new_tokens: Maximum number of tokens to generate.

    Returns:
        The decoded, whitespace-stripped text of the newly generated tokens.

    Raises:
        ValueError: If the fixed prompt text and `max_new_tokens` leave no room
            for any article tokens within the context window.
    """
    # Fixed text surrounding the article
    prompt_head = (
        "Below is an instruction that describes a task.\n"
        "Write a response that appropriately completes the request.\n"
        "\n"
        "### Instruction:\n"
        "Summarize the following news article.\n"
        "\n"
        "### Input:\n"
    )
    prompt_tail = "\n\n### Response:\n"

    head_ids = tokenizer.encode(prompt_head, add_special_tokens=False)
    tail_ids = tokenizer.encode(prompt_tail, add_special_tokens=False)

    # Positions the model can attend to; fall back to the tokenizer's limit
    # when the config does not expose one
    context_limit = (
        getattr(model.config, "max_position_embeddings", None)
        or tokenizer.model_max_length
    )
    article_budget = context_limit - max_new_tokens - len(head_ids) - len(tail_ids)
    if article_budget <= 0:
        raise ValueError(
            f"max_new_tokens={max_new_tokens} leaves no room for the article "
            f"within a context of {context_limit} tokens"
        )

    # Truncate only the article so the response marker is never cut off
    article_ids = tokenizer.encode(
        news_article,
        add_special_tokens=False,
        truncation=True,
        max_length=article_budget,
    )

    # Build the batch on the model's own device so both always agree
    input_ids = torch.tensor(
        [head_ids + article_ids + tail_ids],
        dtype=torch.long,
        device=model.device,
    )
    attention_mask = torch.ones_like(input_ids)

    # Beam search is deterministic, so no sampling parameters are passed
    with torch.no_grad():
        output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens after the prompt; this cannot be confused by an
    # article that itself contains the "### Response:" marker
    new_token_ids = output[0][input_ids.shape[1]:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

# Move model to device and switch to inference mode (disables dropout)
model.to(device)
model.eval()

# Evaluate on a few test examples
print("\nEvaluating model on test examples:")
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')

# The loaded dataset only has a 'train' split, and `test_dataset` was rebound to
# the tokenized data, so the raw article/summary text is read from the held-out
# part of the train/test split instead.
raw_test_split = train_testvalid['test']

# Number of examples to evaluate
num_eval_examples = 5
eval_results = []

for i in range(min(num_eval_examples, len(raw_test_split))):
    # Get the example (columns are 'Articles' and 'Summaries')
    example = raw_test_split[i]
    news_article = example['Articles']
    reference_summary = example['Summaries']

    # Generate summary
    generated_summary = generate_summary(news_article, model, tokenizer)

    # Calculate ROUGE scores
    rouge_scores = rouge.compute(
        predictions=[generated_summary],
        references=[reference_summary],
        use_stemmer=True
    )

    # Calculate BLEU score. The metric takes raw strings and tokenizes them
    # itself. An empty prediction is scored as 0 rather than passed to the
    # metric, whose brevity penalty is undefined for zero-length output.
    if generated_summary.strip():
        bleu_score = bleu.compute(
            predictions=[generated_summary],
            references=[[reference_summary]]
        )
    else:
        bleu_score = {'bleu': 0.0}

    # Store results as plain floats so they serialize to JSON cleanly
    eval_results.append({
        'news_excerpt': news_article[:200] + "...",
        'reference_summary': reference_summary,
        'generated_summary': generated_summary,
        'rouge1': float(rouge_scores['rouge1']),
        'rouge2': float(rouge_scores['rouge2']),
        'rougeL': float(rouge_scores['rougeL']),
        'bleu': float(bleu_score['bleu'])
    })

    # Print the results
    print(f"\nExample {i+1}:")
    print(f"News (excerpt): {news_article[:200]}...")
    print(f"Reference Summary: {reference_summary}")
    print(f"Generated Summary: {generated_summary}")
    print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
    print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
    print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")
    print(f"BLEU: {bleu_score['bleu']:.4f}")

# Save evaluation results
with open("evaluation_results.json", "w") as f:
    json.dump(eval_results, f, indent=4)



In [None]:
# 10. Interactive Testing of the Model
# ----------------------------------

def interactive_summary():
    """Prompt for news articles on stdin and print a generated summary for each.

    Loops until the user types 'quit' (any letter case, surrounding whitespace
    ignored) or ends the input stream with EOF / Ctrl-C. Blank input is
    rejected with a reminder. Uses the module-level `model` and `tokenizer`.
    """
    print("\nEnter a news article to summarize (type 'quit' to exit):")
    while True:
        try:
            news_article = input("\nNews Article: ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session instead of raising
            print()
            break

        # Normalise once so " quit " and whitespace-only input are handled
        article_text = news_article.strip()
        if article_text.lower() == 'quit':
            break

        if article_text == '':
            print("Please enter a news article.")
            continue

        summary = generate_summary(article_text, model, tokenizer, max_new_tokens=150)
        print("\nGenerated Summary:")
        print(summary)



In [None]:
# Run interactive testing.
# NOTE: this blocks on keyboard input until 'quit' is entered, so skip this cell
# when executing the notebook non-interactively.
print("\nStarting interactive testing mode:")
interactive_summary()
