In [1]:
!pip install -q datasets torch tqdm matplotlib tiktoken evaluate nltk accelerate
!pip install -q huggingface_hub

In [2]:
pip install transformers==4.38.0

Note: you may need to restart the kernel to use updated packages.


In [3]:
# import os
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    get_scheduler,
    DataCollatorForLanguageModeling,

)

from datasets import load_dataset, Dataset as HFDataset
import evaluate


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Seed every random number generator the notebook relies on so runs are repeatable.
SEED = 193
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [5]:
# Fetch the BBC news/summary corpus from the Hugging Face Hub and show its layout.
bbc_dataset = load_dataset(path="gopalkalpande/bbc-news-summary")
print(bbc_dataset)

DatasetDict({
    train: Dataset({
        features: ['File_path', 'Articles', 'Summaries'],
        num_rows: 2224
    })
})


In [6]:
# Peek at the first record: its category path plus the first 200 characters
# of the article and of its reference summary.
sample = bbc_dataset['train'][0]
print(
    "\nSample entry:",
    f"File path: {sample['File_path']}",
    f"News (first 200 chars): {sample['Articles'][:200]}...",
    f"Summary (first 200 chars): {sample['Summaries'][:200]}...",
    sep="\n",
)


Sample entry:
File path: politics
News (first 200 chars): Budget to set scene for election..Gordon Brown will seek to put the economy at the centre of Labour's bid for a third term in power when he delivers his ninth Budget at 1230 GMT. He is expected to str...
Summary (first 200 chars): - Increase in the stamp duty threshold from £60,000 - A freeze on petrol duty - An extension of tax credit scheme for poorer families - Possible help for pensioners The stamp duty threshold rise is in...


In [7]:
# Interchangeable phrasings of the summarization request; one is drawn at
# random per example so the model does not overfit to a single wording.
instruction_templates = [
    "Summarize the following news article.",
    "Create a concise summary of this news piece.",
    "What are the key points from this article?",
    "Write a brief summary of the following news article.",
    "Generate a summary that captures the main points of this news article."
]


def format_instruction_example(example):
    """Render one article/summary record as an instruction-style prompt.

    Args:
        example: Mapping that provides the 'Articles' and 'Summaries' fields.

    Returns:
        A dict with the single key "formatted_text" holding the preamble, a
        randomly chosen instruction, the article as input and the summary as
        the response, with sections separated by blank lines.
    """
    # Draw the instruction first so the global RNG is consumed exactly once per call.
    chosen_instruction = random.choice(instruction_templates)

    prompt_sections = (
        "Below is an instruction that describes a task.\n"
        "Write a response that appropriately completes the request.",
        f"### Instruction:\n{chosen_instruction}",
        f"### Input:\n{example['Articles']}",
        f"### Response:\n{example['Summaries']}",
    )
    return {"formatted_text": "\n\n".join(prompt_sections)}
# Attach the instruction-formatted prompt to every article/summary pair.
formatted_all = bbc_dataset['train'].map(format_instruction_example)

# Only a 'train' split is available, so hold out 20% of it as the test set
# (seeded so the partition is reproducible).
train_test_split = formatted_all.train_test_split(test_size=0.2, seed=SEED)
train_formatted, test_formatted = train_test_split['train'], train_test_split['test']

# Show the beginning of one formatted training prompt.
preview = train_formatted[0]['formatted_text'][:500]
print("\nFormatted example:")
print(preview + "...")


Formatted example:
Below is an instruction that describes a task.
Write a response that appropriately completes the request.

### Instruction:
Summarize the following news article.

### Input:
Commodore finds new lease of life..The once-famous Commodore computer brand could be resurrected after being bought by a US-based digital music distributor...New owner Yeahronimo Media Ventures has not ruled out the possibility of a new breed of Commodore computers. It also plans to develop a "worldwide entertainment concept...


In [8]:
# Checkpoint to fine-tune; tokenizer and weights are loaded from the same name.
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# When the tokenizer has no pad token, reuse end-of-sequence for padding and
# mirror that choice in the model config.
if tokenizer.pad_token is None:
    model.config.pad_token_id = model.config.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

param_total = model.num_parameters()
print(f"Model loaded: {model_name}")
print(f"Model parameters: {param_total:,}")




Model loaded: EleutherAI/gpt-neo-125M
Model parameters: 125,198,592


In [9]:
max_length = 128  # Adjust based on your GPU memory and article lengths
# NOTE(review): the prompt places the whole article before "### Response:", so
# with a 128-token budget the summary is probably truncated away for most
# examples — confirm by decoding a few tokenized rows, and consider a larger
# budget or trimming the article before formatting.


def tokenize_function(examples):
    """Tokenize a batch of formatted prompts and attach causal-LM labels.

    Args:
        examples: Batch dict as passed by `Dataset.map(batched=True)`; only the
            "formatted_text" column is read.

    Returns:
        The tokenizer output (every row truncated/padded to `max_length`) with
        an added "labels" entry: a copy of `input_ids` in which padding
        positions are set to -100.
    """
    tokenized = tokenizer(
        examples["formatted_text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )

    # Labels are an unshifted copy of the inputs; Hugging Face causal-LM heads
    # are expected to do the next-token shift themselves when computing the loss.
    labels = tokenized["input_ids"].clone()
    # Mask padding via the attention mask rather than by token id, so padded
    # positions never count as targets even though pad and EOS may share an id.
    # -100 is the default ignore index of the cross-entropy loss.
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels

    return tokenized


In [10]:
# Tokenize the training split in batches and drop the original columns so
# only the tokenizer outputs (and labels) remain.
train_tokenized = train_formatted.map(
    function=tokenize_function,
    remove_columns=train_formatted.column_names,
    batched=True,
)

In [11]:
# Tokenize the held-out split the same way as the training split: batched,
# with the original columns removed.
test_tokenized = test_formatted.map(
    function=tokenize_function,
    remove_columns=test_formatted.column_names,
    batched=True,
)

In [12]:
# Expose both tokenized splits in "torch" format for the training code.
train_dataset, test_dataset = (
    split.with_format("torch") for split in (train_tokenized, test_tokenized)
)

# Sanity-check the split sizes.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 1779
Test dataset size: 445


In [13]:
import transformers

# Confirm which transformers release is actually loaded in this kernel
# (a freshly pip-installed version only takes effect after a kernel restart).
loaded_transformers_version = transformers.__version__
print(loaded_transformers_version)


4.38.0


In [18]:
from transformers import TrainingArguments, Trainer

# No standalone `Accelerator` is built here: Trainer is never handed one and
# sets up its own, so a separately constructed
# `Accelerator(dataloader_config=...)` has no influence on training.
# NOTE(review): the "unexpected keyword argument 'dispatch_batches'" TypeError
# seen with this cell appears to originate inside Trainer's own Accelerator
# construction, i.e. the installed accelerate is too new for transformers
# 4.38.0. Resolve it at install time (accelerate<1.0, or a newer transformers),
# then restart the kernel — it cannot be fixed from this cell.

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    per_device_eval_batch_size=4,
    eval_steps=500,  # Evaluate every eval_steps
    save_steps=1000,  # Save every 1000 steps (kept a multiple of eval_steps for load_best_model_at_end)
    warmup_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",  # Evaluate every eval_steps
    save_strategy="steps",  # Save every save_steps
    load_best_model_at_end=True,
    save_total_limit=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    report_to="none",  # Disable wandb, etc.
    # Trainer re-seeds the RNGs from this value when it is constructed; pass the
    # notebook-wide SEED so it does not silently fall back to its own default.
    seed=SEED,
)

# Collator for causal (non-masked) language modeling.
# NOTE(review): this collator is expected to rebuild `labels` from `input_ids`
# and ignore pad-token positions; since pad and EOS may share an id here, EOS
# targets would be ignored too — confirm if that matters for generation.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # We're doing causal language modeling, not masked
)

# Create the Trainer; the held-out split is used for periodic evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)


TypeError: Accelerator.__init__() got an unexpected keyword argument 'dispatch_batches'