## Setup

In [1]:
!pip install -U datasets transformers trl



## Imports

In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer
)
from trl import SFTTrainer

import os
import torch

## Configuration

In [3]:
batch_size = 1
num_workers = os.cpu_count()
epochs = 5
bf16 = True
fp16 = False
gradient_accumulation_steps = 8
context_length = 1024
learning_rate = 0.0002
model_name = 'facebook/opt-350m'
out_dir = 'outputs/opt_350m_summarizer'
seed = 42

## Dataset Preparation

In [4]:
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=seed)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']
 
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


In [5]:
print(dataset_train[0])

{'File_path': 'entertainment', 'Articles': 'Angels \'favourite funeral song\'..Angels by Robbie Williams is the song Britons would most like played at their funeral, a survey has suggested...While the melancholy hit topped the UK poll, Europeans favoured Queen\'s more upbeat anthem The Show Must Go On as their first choice. Frank Sinatra\'s My Way was second in the UK vote with Monty Python\'s Always Look on the Bright Side of Life in third place. More than 45,000 people were surveyed by digital TV station Music Choice...The European chart, which included Denmark, France and Germany, put Led Zeppelin\'s Stairway to Heaven in second and AC/DC\'s Highway to Hell in third. Queen\'s Who Wants to Live Forever was highly favoured by both UK and European voters...Both lists featured only one traditional or classic song each, with Britons requesting the Royal Scots Dragoon Guards\' Amazing Grace and their continental counterparts opting for Mozart\'s Requiem. "Wanting to share your most treasu

In [6]:
# def preprocess_function(example):
#     text = f"### ARTICLE:\n{example['Articles']}\n### SUMMARY:\n{example['Summaries']}\n"
#     return text

def preprocess_function(example):
    output_text = []
    for i in range(len(example['Articles'])):
        text = f"### Instruction:\nSummarize the following article.\n\n### Input:\n{example['Articles'][i]}\n\n### Response:\n{example['Summaries'][i]}</s>\n"
        output_text.append(text)
    return output_text

## Model

In [7]:
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)



In [8]:
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

## Tokenizer

In [9]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False
)

In [10]:
tokenizer.add_eos_token = True

In [11]:
tokenizer.bos_token = '<s>'

In [12]:
print(tokenizer)

GPT2Tokenizer(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


## Training

In [13]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    evaluation_strategy='epoch',
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='tensorboard',
    num_train_epochs=epochs,
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
    seed=seed
)

In [14]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_train,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function
)

Map:   0%|          | 0/1779 [00:00<?, ? examples/s]

In [15]:
dataloader = trainer.get_train_dataloader()
for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['input_ids'][0]))
    print('#'*50)
    if i == 5:
        break

<s>### Instruction:
Summarize the following article.

### Input:
PC photo printers challenge pros..Home printed pictures can be cheaper and higher quality than those from High Street developers, tests shows...A survey carried out by PC Pro magazine looked at which of 100 home photo printers offered a better deal than handing your snaps to a photo lab. The tests found that images from top PC printers kept their colour longer than professionally produced photographs. But using the wrong printer cartridge could means snaps fade in months, warned the magazine...The group test of 100 home photo printers for PCs discovered how much it costs to create images using the devices compared to online developers as well as High Street names such as Jessops, Boots and Snappy Snaps. The comprehensive test also revealed how quickly different printers produced images, the quality of the finished image and how resistant finished pictures were to smudging or water damage. It found that although some ink f

## Train

In [16]:
history = trainer.train()

Epoch,Training Loss,Validation Loss
0,2.3277,1.710479
1,1.8077,1.320235
2,1.4368,1.006781
3,1.1417,0.741407
4,0.8846,0.541737


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [17]:
model.save_pretrained(f"{out_dir}/best_model")
tokenizer.save_pretrained(f"{out_dir}/best_model")

('outputs/opt_350m_summarizer/best_model/tokenizer_config.json',
 'outputs/opt_350m_summarizer/best_model/special_tokens_map.json',
 'outputs/opt_350m_summarizer/best_model/vocab.json',
 'outputs/opt_350m_summarizer/best_model/merges.txt',
 'outputs/opt_350m_summarizer/best_model/added_tokens.json')

## Inference

In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

import torch

In [19]:
model = AutoModelForCausalLM.from_pretrained(
    'outputs/opt_350m_summarizer/best_model',
    device_map='cuda'
)
tokenizer = AutoTokenizer.from_pretrained('outputs/opt_350m_summarizer/best_model/')

In [20]:
def post_process(text):
    return ''.join(text.split('### SUMMARY:')[0])

In [26]:
prompt = """<s>### Instruction:
Summarize the following article.

### Input:
George Russell and Fernando Alonso have each offered their take on the incident that preceded Russell’s dramatic crash on the penultimate lap of the Australian Grand Prix following their battle for P6, with the Spaniard subsequently hit with a 20-second penalty after the race.

Russell had been chasing the Aston Martin for several laps following his final pit stop but, after getting close towards the Turn 6/7 complex, lost control of his Mercedes and hit the barriers, with the W15 then ricocheting back onto the track and ending up on its side.

READ MORE: Alonso hit with post-race time penalty in Australia over ‘potentially dangerous’ driving before Russell crash

While Russell fortunately reported that he was unharmed in the incident, it ultimately ended what had been a tough day for the Silver Arrows following Lewis Hamilton’s earlier retirement due to a mechanical issue.

It was confirmed after the race that both Russell and Alonso had been summoned to the stewards over the incident, with the Aston Martin man hit with a 20-second penalty for what the stewards deemed was "potentially dangerous" driving. Speaking before the hearing, Alonso gave his version of events during a conversation on Sky Sports.

“Well, obviously I was focusing in front of me and not behind,” the Spaniard explained. “I had some issues for the last 15 laps, something on the battery on the deployment, so definitely I was struggling a little bit at the end of the race, but yeah, I cannot focus on the cars behind. But he’s okay apparently, I saw the car and I was very worried.”

### Response:
"""

In [27]:
print(prompt)

<s>### Instruction:
Summarize the following article.

### Input:
George Russell and Fernando Alonso have each offered their take on the incident that preceded Russell’s dramatic crash on the penultimate lap of the Australian Grand Prix following their battle for P6, with the Spaniard subsequently hit with a 20-second penalty after the race.

Russell had been chasing the Aston Martin for several laps following his final pit stop but, after getting close towards the Turn 6/7 complex, lost control of his Mercedes and hit the barriers, with the W15 then ricocheting back onto the track and ending up on its side.

READ MORE: Alonso hit with post-race time penalty in Australia over ‘potentially dangerous’ driving before Russell crash

While Russell fortunately reported that he was unharmed in the incident, it ultimately ended what had been a tough day for the Silver Arrows following Lewis Hamilton’s earlier retirement due to a mechanical issue.

It was confirmed after the race that both Russe

In [28]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    return_full_text=False,
)
result = pipe(
    prompt,
    eos_token_id=tokenizer.eos_token_id
)

print(result[0]['generated_text'])

While Russell fortunately reported that he was unharmed in the incident, it ultimately ended what had been a tough day for the Silver Arrows following Lewis Hamilton's earlier retirement due to a mechanical issue.Speaking before the hearing, Alonso gave his version of events during a conversation on Sky Sports.Russell had been chasing the Aston Martin for several laps following his final pit stop but, after getting close towards the Turn 6/7 complex, lost control of his Mercedes and hit the barriers, with the W15 then ricocheting back onto the track and ending up on its side.Speaking after the hearing, Alonso gave his version of events during a conversation on Sky Sports.


In [24]:
# print(post_process(result[0]['generated_text']))