In [1]:
# !pip install -U transformers
# !pip install -U datasets
# !pip install tensorboard
# !pip install sentencepiece
# !pip install accelerate

In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    AutoModelForCausalLM
)

import os

## Dataset Preparation

In [3]:
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


In [4]:
print(dataset_train[0])

{'File_path': 'politics', 'Articles': 'Borders rail link campaign rally..Campaigners are to stage a rally calling for a Borders rail link which was closed in 1969 to be reopened...They will mark the 36th anniversary of the line closure, which ran from Edinburgh through the Borders and on to Carlisle, with a walk at Tweedbank. Anne Borthwick, of Campaign for Borders Rail, said reopening the Waverley Line would restore the area\'s prosperity. MSPs are considering the reintroduction of passenger rail services through Midlothian to the Borders. Campaigners have said that reopening the Waverley Line, which could cost up to £100m, would be a huge economic boost for the Borders...In 2000, Borders Council said the area\'s economy had suffered since the closure. Ms Borthwick said the lobby group was determined to keep the pressure on the Scottish Executive. "We are hoping that many people will join us in a march to mark the 36th anniversary of the closure of the Waverley Line," she said. "Campa

## Configurations

In [5]:
MODEL = 'gpt2'
BATCH_SIZE = 4
NUM_PROCS = os.cpu_count()
EPOCHS = 20
OUT_DIR = 'results_gpt2_bbc_news_summary'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

## Tokenization

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [7]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [8]:
text = dataset_train[10]['Articles'] + ' TL;DR: ' + dataset_train[10]['Summaries']

In [9]:
text

"IBM puts cash behind Linux push..IBM is spending $100m (£52m) over the next three years beefing up its commitment to Linux software...The cash injection will be used to help its customers use Linux on every type of device from handheld computers and phones right up to powerful servers. IBM said the money will fund a variety of technical, research and marketing initiatives to boost Linux use. IBM said it had taken the step in response to greater customer demand for the open source software...In 2004 IBM said it had seen double digit growth in the number of customers using Linux to help staff work together more closely. The money will be used to help this push towards greater collaboration and will add Linux-based elements to IBM's Workplace software. Workplace is a suite of programs and tools that allow workers to get at core business applications no matter what device they use to connect to corporate networks. One of the main focuses of the initiative will be to make it easier to use 

In [10]:
def preprocess_function(example):
    context = f"{example['Articles'] + ' TL;DR: ' + example['Summaries']}"
    final_tokens = tokenizer(context, max_length=MAX_LENGTH, padding='max_length')
    return final_tokens

In [11]:
tokenized_train = dataset_train.map(
    preprocess_function,
#     batched=True,
    num_proc=NUM_PROCS,
    remove_columns=dataset_train.column_names,
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
#     batched=True,
    num_proc=NUM_PROCS,
    remove_columns=dataset_valid.column_names,
)

Map (num_proc=16):   0%|          | 0/1779 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/445 [00:00<?, ? examples/s]

In [12]:
print(tokenized_train[0])

{'input_ids': [33, 6361, 6787, 2792, 1923, 7903, 492, 46102, 364, 389, 284, 3800, 257, 7903, 4585, 329, 257, 40934, 6787, 2792, 543, 373, 4838, 287, 16450, 284, 307, 37415, 986, 2990, 481, 1317, 262, 4570, 400, 11162, 286, 262, 1627, 16512, 11, 543, 4966, 422, 23475, 832, 262, 40934, 290, 319, 284, 8124, 20919, 11, 351, 257, 2513, 379, 24205, 276, 17796, 13, 15397, 347, 1506, 16239, 11, 286, 13718, 329, 40934, 12950, 11, 531, 302, 29443, 262, 370, 8770, 1636, 6910, 561, 11169, 262, 1989, 338, 19519, 13, 337, 4303, 82, 389, 6402, 262, 38368, 596, 286, 11849, 6787, 2594, 832, 7215, 75, 849, 666, 284, 262, 40934, 13, 13718, 364, 423, 531, 326, 302, 29443, 262, 370, 8770, 1636, 6910, 11, 543, 714, 1575, 510, 284, 4248, 3064, 76, 11, 561, 307, 257, 3236, 3034, 5750, 329, 262, 40934, 986, 818, 4751, 11, 40934, 4281, 531, 262, 1989, 338, 3773, 550, 6989, 1201, 262, 16512, 13, 6997, 347, 1506, 16239, 531, 262, 10866, 1448, 373, 5295, 284, 1394, 262, 3833, 319, 262, 11905, 10390, 13, 366, 1135,

In [13]:
def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= MAX_LENGTH:
        total_length = (total_length // MAX_LENGTH) * MAX_LENGTH
    result = {
        k: [t[i : i + MAX_LENGTH] for i in range(0, total_length, MAX_LENGTH)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [14]:
lm_dataset_train = tokenized_train.map(
    group_texts, num_proc=NUM_PROCS, batched=True
)
lm_dataset_valid = tokenized_valid.map(
    group_texts, num_proc=NUM_PROCS, batched=True
)

Map (num_proc=16):   0%|          | 0/1779 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/445 [00:00<?, ? examples/s]

In [15]:
print(lm_dataset_train[0])

{'input_ids': [33, 6361, 6787, 2792, 1923, 7903, 492, 46102, 364, 389, 284, 3800, 257, 7903, 4585, 329, 257, 40934, 6787, 2792, 543, 373, 4838, 287, 16450, 284, 307, 37415, 986, 2990, 481, 1317, 262, 4570, 400, 11162, 286, 262, 1627, 16512, 11, 543, 4966, 422, 23475, 832, 262, 40934, 290, 319, 284, 8124, 20919, 11, 351, 257, 2513, 379, 24205, 276, 17796, 13, 15397, 347, 1506, 16239, 11, 286, 13718, 329, 40934, 12950, 11, 531, 302, 29443, 262, 370, 8770, 1636, 6910, 561, 11169, 262, 1989, 338, 19519, 13, 337, 4303, 82, 389, 6402, 262, 38368, 596, 286, 11849, 6787, 2594, 832, 7215, 75, 849, 666, 284, 262, 40934, 13, 13718, 364, 423, 531, 326, 302, 29443, 262, 370, 8770, 1636, 6910, 11, 543, 714, 1575, 510, 284, 4248, 3064, 76, 11, 561, 307, 257, 3236, 3034, 5750, 329, 262, 40934, 986, 818, 4751, 11, 40934, 4281, 531, 262, 1989, 338, 3773, 550, 6989, 1201, 262, 16512, 13, 6997, 347, 1506, 16239, 531, 262, 10866, 1448, 373, 5295, 284, 1394, 262, 3833, 319, 262, 11905, 10390, 13, 366, 1135,

## Model

In [16]:
model = AutoModelForCausalLM.from_pretrained(MODEL)

In [17]:
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

124,439,808 total parameters.
124,439,808 training parameters.


## Training

In [18]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to='tensorboard',
    learning_rate=0.00001,
    fp16=True,
    dataloader_num_workers=NUM_PROCS
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset_train,
    eval_dataset=lm_dataset_valid,
)

history = trainer.train()

Epoch,Training Loss,Validation Loss
1,3.1052,2.861994
2,2.9463,2.806295
3,2.7563,2.77892
4,2.7497,2.760299
5,2.7744,2.749584
6,2.6583,2.74232
7,2.502,2.737483
8,2.5637,2.734116
9,2.5941,2.729765
10,2.6057,2.728679


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [19]:
model.save_pretrained(f"{OUT_DIR}/final_model")

In [20]:
tokenizer.save_pretrained(f"{OUT_DIR}/final_model")

('results_gpt2_bbc_news_summary/final_model/tokenizer_config.json',
 'results_gpt2_bbc_news_summary/final_model/special_tokens_map.json',
 'results_gpt2_bbc_news_summary/final_model/vocab.json',
 'results_gpt2_bbc_news_summary/final_model/merges.txt',
 'results_gpt2_bbc_news_summary/final_model/added_tokens.json',
 'results_gpt2_bbc_news_summary/final_model/tokenizer.json')

## Inference

In [21]:
from transformers import AutoModelForCausalLM, AutoTokenizer

import torch

In [22]:
model_path = 'results_gpt2_bbc_news_summary/final_model'
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [23]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [24]:
prompt = """American-led negotiators are edging closer to an agreement in which Israel would suspend its war in Gaza for about two months in exchange for the release of more than 100 hostages still held by Hamas, a deal that could be sealed in the next two weeks and would transform the conflict consuming the region.

Negotiators have developed a written draft agreement merging proposals offered by Israel and Hamas in the last 10 days into a basic framework that will be the subject of talks in Paris on Sunday. While there are still important disagreements to be worked out, negotiators are cautiously optimistic that a final accord is within reach, according to U.S. officials who insisted on anonymity to discuss sensitive talks.

President Biden spoke by phone separately Friday with the leaders of Egypt and Qatar, who have served as intermediaries with Hamas, to narrow the remaining differences. He is also sending his C.I.A. director, William J. Burns, to Paris for Sunday’s talks with Israeli, Egyptian and Qatari officials. If Mr. Burns makes enough progress, Mr. Biden may then send his Middle East coordinator, Brett McGurk, who just returned to Washington, back to the region to help finalize the agreement.

“Both leaders affirmed that a hostage deal is central to establishing a prolonged humanitarian pause in the fighting and ensure additional lifesaving humanitarian assistance reaches civilians in need throughout Gaza,” the White House said in a statement Friday night summarizing the president’s conversation with Sheikh Mohammed bin Abdulrahman  al-Thani, Qatar’s prime minister. “They underscored the urgency of the situation and welcomed the close cooperation among their teams to advance recent discussions.”

In a statement in Israel on Saturday, Prime Minister Benjamin Netanyahu reaffirmed his commitment to securing the release of those hostages who were not freed as part of a more limited agreement in November. “As of today, we have returned 110 of our hostages and we are committed to returning all of them home,” he said. “We are dealing with this and we are doing so around the clock, including now.”"""

In [25]:
print(prompt)

American-led negotiators are edging closer to an agreement in which Israel would suspend its war in Gaza for about two months in exchange for the release of more than 100 hostages still held by Hamas, a deal that could be sealed in the next two weeks and would transform the conflict consuming the region.

Negotiators have developed a written draft agreement merging proposals offered by Israel and Hamas in the last 10 days into a basic framework that will be the subject of talks in Paris on Sunday. While there are still important disagreements to be worked out, negotiators are cautiously optimistic that a final accord is within reach, according to U.S. officials who insisted on anonymity to discuss sensitive talks.

President Biden spoke by phone separately Friday with the leaders of Egypt and Qatar, who have served as intermediaries with Hamas, to narrow the remaining differences. He is also sending his C.I.A. director, William J. Burns, to Paris for Sunday’s talks with Israeli, Egypti

In [26]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5):
    # Preprocess the text
    inputs = tokenizer.encode(
        text + ' TL;DR: ',
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )

    with torch.no_grad():
        # Generate the summary
        summary_ids = model.generate(
            inputs,
            max_length=512,
            num_beams=num_beams,
            early_stopping=True,
        )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [27]:
summarize_text(prompt, model, tokenizer, num_beams=5)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'American-led negotiators are edging closer to an agreement in which Israel would suspend its war in Gaza for about two months in exchange for the release of more than 100 hostages still held by Hamas, a deal that could be sealed in the next two weeks and would transform the conflict consuming the region.\n\nNegotiators have developed a written draft agreement merging proposals offered by Israel and Hamas in the last 10 days into a basic framework that will be the subject of talks in Paris on Sunday. While there are still important disagreements to be worked out, negotiators are cautiously optimistic that a final accord is within reach, according to U.S. officials who insisted on anonymity to discuss sensitive talks.\n\nPresident Biden spoke by phone separately Friday with the leaders of Egypt and Qatar, who have served as intermediaries with Hamas, to narrow the remaining differences. He is also sending his C.I.A. director, William J. Burns, to Paris for Sunday’s talks with Israeli, E