In [2]:
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration

## Load the PubMed Summarization dataset from Hugging Face.

In [3]:
dataset = load_dataset("ccdv/pubmed-summarization", "document")

## Explore the dataset to understand its structure and contents

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 119924
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6633
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6658
    })
})


In [5]:
print(dataset['train'][0])

{'article': "a recent systematic analysis showed that in 2011 , 314 ( 296 - 331 ) million children younger than 5 years were mildly , moderately or severely stunted and 258 ( 240 - 274 ) million were mildly , moderately or severely underweight in the developing countries . in iran a study among 752 high school girls in sistan and baluchestan showed prevalence of 16.2% , 8.6% and 1.5% , for underweight , overweight and obesity , respectively . the prevalence of malnutrition among elementary school aged children in tehran varied from 6% to 16% . anthropometric study of elementary school students in shiraz revealed that 16% of them suffer from malnutrition and low body weight . snack should have 300 - 400 kcal energy and could provide 5 - 10 g of protein / day . nowadays , school nutrition programs are running as the national programs , world - wide . national school lunch program in the united states there are also some reports regarding school feeding programs in developing countries . 

## Preprocess the dataset to clean and prepare the text for summarization

In [6]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')


def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["abstract"], max_length=150, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Apply the preprocessing function to the entire dataset
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Remove unnecessary columns
tokenized_datasets = tokenized_datasets.remove_columns(["article", "abstract"])

# Set the format to PyTorch tensors
tokenized_datasets.set_format("torch")

# Print a sample to check the preprocessing
print(tokenized_datasets['train'][0])

{'input_ids': tensor([21603,    10,     3,     9,  1100, 20036,  1693,  3217,    24,    16,
         2722,     3,     6,   220,  2534,    41,   204,  4314,     3,    18,
          220,  3341,     3,    61,   770,   502,  5868,   145,   305,   203,
          130,  8248,   120,     3,     6,  8107,   120,    42, 20215, 27572,
           15,    26,    11,   204,  3449,    41,     3, 11944,     3,    18,
          204,  4581,     3,    61,   770,   130,  8248,   120,     3,     6,
         8107,   120,    42, 20215,   365,  9378,    16,     8,  2421,  1440,
            3,     5,    16,     3,    23,  2002,     3,     9,   810,   859,
         6374,   357,   306,   496,  3567,    16,   108,  5627,    11,  6561,
         2295,   222,   152,  3217, 24753,    13, 10128,  5406,     3,     6,
         4848,  6370,    11,  1300,  2712,     3,     6,    21,   365,  9378,
            3,     6, 26676,    11, 18719,     3,     6,  6898,     3,     5,
            8, 24753,    13,  1460,  8631,  1575, 

## T5 Model for Text Summarization

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
import accelerate

# Initialize the model
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=0.001,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Define data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss



KeyboardInterrupt



## Summary Generator

In [None]:
# Define a function to summarize text
def summarize(text, model, tokenizer):
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, length_penalty=2.0, max_length=150, min_length=40, no_repeat_ngram_size=3, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Test the function with an example article
example_article = input("Enter the article: ")
summary = summarize(example_article, model, tokenizer)
print(summary)