In [1]:
# !pip install -U transformers
# !pip install -U datasets
# !pip install tensorboard
# !pip install sentencepiece
# !pip install accelerate

In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM
)

import os

## Dataset Preparation

In [3]:
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


In [4]:
print(dataset_train[0])

{'File_path': 'sport', 'Articles': 'Bellamy fined after row..Newcastle have fined their Welsh striker Craig Bellamy two weeks\' wages - about £80,000 - following his row with manager Graeme Souness...But Bellamy, 25, has not been put on the transfer list, although he did not train with the first team on Tuesday. Magpies chairman Freddy Shepherd told the Newcastle Evening Chronicle: "It is not about money. It is about a player thinking he is bigger than this club. "No individual is, be it the chairman, the manager or a player." Souness dropped Bellamy for Sunday\'s game against Arsenal, claiming the Welshman had feigned injury after being asked to play out of position. "When I heard what the manager was saying I was in shock," Bellamy said. "I thought \'not only has he gone behind my back, he\'s lying\'," he said in response to Souness\' remarks. And the Wales international refused to apologise. "I won\'t apologise because I have done nothing wrong," he told the Evening Chronicle. "Ther

## Configurations

In [5]:
MODEL = 'gpt2-medium'
BATCH_SIZE = 1
NUM_PROCS = os.cpu_count()
EPOCHS = 3
OUT_DIR = 'results_gpt2_medium_bbc_news_summary'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

## Tokenization

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [7]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [8]:
text = dataset_train[10]['Articles'] + ' TL;DR: ' + dataset_train[10]['Summaries']

In [9]:
text

'Gamer buys $26,500 virtual land..A 22-year-old gamer has spent $26,500 (£13,700) on an island that exists only in a computer role-playing game (RPG)...The Australian gamer, known only by his gaming moniker Deathifier, bought the island in an online auction. The land exists within the game Project Entropia, an RPG which allows thousands of players to interact with each other. Entropia allows gamers to buy and sell virtual items using real cash, while fans of other titles often use auction site eBay to sell their virtual wares. Earlier this year economists calculated that these massively multi-player online role-playing games (MMORPGs) have a gross economic impact equivalent to the GDP of the African nation of Namibia..."This is a historic moment in gaming history, and this sale only goes to prove that massive multi-player online gaming has reached a new plateau," said Marco Behrmann, director of community relations at Mindark, the game\'s developer...The virtual island includes a gigan

In [10]:
def preprocess_function(example):
    context = f"{example['Articles'] + ' TL;DR: ' + example['Summaries']}"
    final_tokens = tokenizer(context, max_length=MAX_LENGTH, padding='max_length')
    return final_tokens

In [11]:
tokenized_train = dataset_train.map(
    preprocess_function,
#     batched=True,
    num_proc=NUM_PROCS,
    remove_columns=dataset_train.column_names,
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
#     batched=True,
    num_proc=NUM_PROCS,
    remove_columns=dataset_valid.column_names,
)

Map (num_proc=16):   0%|          | 0/1779 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/445 [00:00<?, ? examples/s]

In [12]:
print(tokenized_train[0])

{'input_ids': [36488, 14814, 22643, 706, 5752, 492, 3791, 18676, 423, 22643, 511, 22945, 19099, 13854, 7459, 14814, 734, 2745, 6, 9400, 532, 546, 4248, 1795, 11, 830, 532, 1708, 465, 5752, 351, 4706, 7037, 34755, 311, 977, 408, 986, 1537, 7459, 14814, 11, 1679, 11, 468, 407, 587, 1234, 319, 262, 4351, 1351, 11, 3584, 339, 750, 407, 4512, 351, 262, 717, 1074, 319, 3431, 13, 2944, 79, 444, 8900, 45437, 30890, 1297, 262, 22410, 31867, 14160, 25, 366, 1026, 318, 407, 546, 1637, 13, 632, 318, 546, 257, 2137, 3612, 339, 318, 5749, 621, 428, 3430, 13, 366, 2949, 1981, 318, 11, 307, 340, 262, 8900, 11, 262, 4706, 393, 257, 2137, 526, 311, 977, 408, 5710, 7459, 14814, 329, 3502, 338, 983, 1028, 13837, 11, 8512, 262, 22945, 805, 550, 730, 3916, 5095, 706, 852, 1965, 284, 711, 503, 286, 2292, 13, 366, 2215, 314, 2982, 644, 262, 4706, 373, 2282, 314, 373, 287, 6380, 553, 7459, 14814, 531, 13, 366, 40, 1807, 705, 1662, 691, 468, 339, 3750, 2157, 616, 736, 11, 339, 338, 9105, 40264, 339, 531, 287, 2

In [13]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= MAX_LENGTH:
        total_length = (total_length // MAX_LENGTH) * MAX_LENGTH
    result = {
        k: [t[i : i + MAX_LENGTH] for i in range(0, total_length, MAX_LENGTH)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [14]:
lm_dataset_train = tokenized_train.map(
    group_texts, num_proc=NUM_PROCS, batched=True
)
lm_dataset_valid = tokenized_valid.map(
    group_texts, num_proc=NUM_PROCS, batched=True
)

Map (num_proc=16):   0%|          | 0/1779 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/445 [00:00<?, ? examples/s]

In [15]:
print(lm_dataset_train[0])

{'input_ids': [36488, 14814, 22643, 706, 5752, 492, 3791, 18676, 423, 22643, 511, 22945, 19099, 13854, 7459, 14814, 734, 2745, 6, 9400, 532, 546, 4248, 1795, 11, 830, 532, 1708, 465, 5752, 351, 4706, 7037, 34755, 311, 977, 408, 986, 1537, 7459, 14814, 11, 1679, 11, 468, 407, 587, 1234, 319, 262, 4351, 1351, 11, 3584, 339, 750, 407, 4512, 351, 262, 717, 1074, 319, 3431, 13, 2944, 79, 444, 8900, 45437, 30890, 1297, 262, 22410, 31867, 14160, 25, 366, 1026, 318, 407, 546, 1637, 13, 632, 318, 546, 257, 2137, 3612, 339, 318, 5749, 621, 428, 3430, 13, 366, 2949, 1981, 318, 11, 307, 340, 262, 8900, 11, 262, 4706, 393, 257, 2137, 526, 311, 977, 408, 5710, 7459, 14814, 329, 3502, 338, 983, 1028, 13837, 11, 8512, 262, 22945, 805, 550, 730, 3916, 5095, 706, 852, 1965, 284, 711, 503, 286, 2292, 13, 366, 2215, 314, 2982, 644, 262, 4706, 373, 2282, 314, 373, 287, 6380, 553, 7459, 14814, 531, 13, 366, 40, 1807, 705, 1662, 691, 468, 339, 3750, 2157, 616, 736, 11, 339, 338, 9105, 40264, 339, 531, 287, 2

## Model

In [16]:
model = AutoModelForCausalLM.from_pretrained(MODEL)

In [17]:
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

354,823,168 total parameters.
354,823,168 training parameters.


## Training

In [18]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to='tensorboard',
    learning_rate=0.00001,
    fp16=True,
    dataloader_num_workers=NUM_PROCS
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset_train,
    eval_dataset=lm_dataset_valid,
)

history = trainer.train()

Epoch,Training Loss,Validation Loss
1,2.7848,2.503491
2,2.6709,2.469852
3,2.4074,2.466026


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [19]:
model.save_pretrained(f"{OUT_DIR}/final_model")

In [20]:
tokenizer.save_pretrained(f"{OUT_DIR}/final_model")

('results_gpt2_bbc_news_summary/final_model/tokenizer_config.json',
 'results_gpt2_bbc_news_summary/final_model/special_tokens_map.json',
 'results_gpt2_bbc_news_summary/final_model/vocab.json',
 'results_gpt2_bbc_news_summary/final_model/merges.txt',
 'results_gpt2_bbc_news_summary/final_model/added_tokens.json',
 'results_gpt2_bbc_news_summary/final_model/tokenizer.json')

## Inference

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import torch

In [22]:
model_path = 'results_gpt2_medium_bbc_news_summary/final_model'
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [23]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [24]:
prompt = """American-led negotiators are edging closer to an agreement in which Israel would suspend its war in Gaza for about two months in exchange for the release of more than 100 hostages still held by Hamas, a deal that could be sealed in the next two weeks and would transform the conflict consuming the region.

Negotiators have developed a written draft agreement merging proposals offered by Israel and Hamas in the last 10 days into a basic framework that will be the subject of talks in Paris on Sunday. While there are still important disagreements to be worked out, negotiators are cautiously optimistic that a final accord is within reach, according to U.S. officials who insisted on anonymity to discuss sensitive talks.

President Biden spoke by phone separately Friday with the leaders of Egypt and Qatar, who have served as intermediaries with Hamas, to narrow the remaining differences. He is also sending his C.I.A. director, William J. Burns, to Paris for Sunday’s talks with Israeli, Egyptian and Qatari officials. If Mr. Burns makes enough progress, Mr. Biden may then send his Middle East coordinator, Brett McGurk, who just returned to Washington, back to the region to help finalize the agreement.

“Both leaders affirmed that a hostage deal is central to establishing a prolonged humanitarian pause in the fighting and ensure additional lifesaving humanitarian assistance reaches civilians in need throughout Gaza,” the White House said in a statement Friday night summarizing the president’s conversation with Sheikh Mohammed bin Abdulrahman  al-Thani, Qatar’s prime minister. “They underscored the urgency of the situation and welcomed the close cooperation among their teams to advance recent discussions.”

In a statement in Israel on Saturday, Prime Minister Benjamin Netanyahu reaffirmed his commitment to securing the release of those hostages who were not freed as part of a more limited agreement in November. “As of today, we have returned 110 of our hostages and we are committed to returning all of them home,” he said. “We are dealing with this and we are doing so around the clock, including now.”"""

In [25]:
print(prompt)

American-led negotiators are edging closer to an agreement in which Israel would suspend its war in Gaza for about two months in exchange for the release of more than 100 hostages still held by Hamas, a deal that could be sealed in the next two weeks and would transform the conflict consuming the region.

Negotiators have developed a written draft agreement merging proposals offered by Israel and Hamas in the last 10 days into a basic framework that will be the subject of talks in Paris on Sunday. While there are still important disagreements to be worked out, negotiators are cautiously optimistic that a final accord is within reach, according to U.S. officials who insisted on anonymity to discuss sensitive talks.

President Biden spoke by phone separately Friday with the leaders of Egypt and Qatar, who have served as intermediaries with Hamas, to narrow the remaining differences. He is also sending his C.I.A. director, William J. Burns, to Paris for Sunday’s talks with Israeli, Egypti

In [26]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5):
    # Preprocess the text
    inputs = tokenizer.encode(
        text + ' TL;DR: ',
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )

    with torch.no_grad():
        # Generate the summary
        summary_ids = model.generate(
            inputs,
            max_length=512,
            num_beams=num_beams,
            early_stopping=True,
        )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [28]:
summarize_text(prompt, model, tokenizer, num_beams=5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"American-led negotiators are edging closer to an agreement in which Israel would suspend its war in Gaza for about two months in exchange for the release of more than 100 hostages still held by Hamas, a deal that could be sealed in the next two weeks and would transform the conflict consuming the region.\n\nNegotiators have developed a written draft agreement merging proposals offered by Israel and Hamas in the last 10 days into a basic framework that will be the subject of talks in Paris on Sunday. While there are still important disagreements to be worked out, negotiators are cautiously optimistic that a final accord is within reach, according to U.S. officials who insisted on anonymity to discuss sensitive talks.\n\nPresident Biden spoke by phone separately Friday with the leaders of Egypt and Qatar, who have served as intermediaries with Hamas, to narrow the remaining differences. He is also sending his C.I.A. director, William J. Burns, to Paris for Sunday’s talks with Israeli, E