## Installing the required packages

In [None]:
# Install Hugging Face Transformers into the environment of the running kernel.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install transformers

In [None]:
# SentencePiece is required by the Pegasus tokenizer.
# %pip (rather than !pip) installs into the interpreter the kernel is running.
%pip install SentencePiece

In [None]:
# Hugging Face Datasets provides the Dataset / DatasetDict containers used below.
# %pip (rather than !pip) installs into the interpreter the kernel is running.
%pip install datasets

# Initializing the Pegasus pre-trained model

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hub identifier of the pre-trained checkpoint that is fine-tuned in this notebook.
model_ckpt = "google/pegasus-cnn_dailymail"

# Tokenizer and model are loaded from the same checkpoint so their vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

# Loading the training data for fine-tuning

In [None]:
import pandas as pd

# Read the Inshorts spreadsheet and discard the metadata columns that are not
# used for fine-tuning. The trailing spaces in 'Source ' and 'Time ' are part
# of the column names passed to drop().
news = pd.read_excel(
    "/content/Inshorts Cleaned Data.xlsx",
    engine="openpyxl",
).drop(columns=['Source ', 'Time ', 'Publish Date'])

news.head()

Unnamed: 0,Headline,Short
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a..."
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...


In [None]:
# Give every row a 1-based sequential identifier in a new 'id' column.
news = news.assign(id=range(1, len(news) + 1))
news.tail()

Unnamed: 0,Headline,Short,id
55099,Sensex loses 400 points to hit 52-week low,"Tracking weak cues from the Asian markets, the...",55100
55100,China to inject $91 bn into the money markets,Amid growing concerns about China&#39;s econom...,55101
55101,Ghulam Ali set to make acting debut in Bollywood,Pakistani Ghazal singer Ghulam Ali will soon m...,55102
55102,IS acknowledges death of Jihadi John: Report,The Islamic State (IS) has acknowledged the de...,55103
55103,Cairn to seek $600 mn from India in damages,UK-based oil firm Cairn Energy on Tuesday said...,55104


In [None]:
# Preview the first five rows to confirm the 'id' column is present.
news.head(5)

Unnamed: 0,Headline,Short,id
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...,1
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...,2
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a...",3
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...,4
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...,5


## Converting from DataFrame to Dataset format

In [None]:
from datasets import Dataset

# Wrap the DataFrame in a Hugging Face Dataset so it can be split and mapped.
data = Dataset.from_pandas(df=news)

## Train-test split

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle
# deterministic, so a Restart & Run All yields the same train/test membership.
usable_data = data.train_test_split(test_size=0.2, seed=42)
usable_data

DatasetDict({
    train: Dataset({
        features: ['Headline', 'Short', 'id'],
        num_rows: 44083
    })
    test: Dataset({
        features: ['Headline', 'Short', 'id'],
        num_rows: 11021
    })
})

## Printing data to check

In [None]:
# Number of rows in each split, in the DatasetDict's own iteration order.
split_lengths = [len(usable_data[split]) for split in usable_data]

print(f"Split lengths: {split_lengths}")
print(f"Features: {usable_data['train'].column_names}")

# Show the first held-out example. "\n" adds a blank line before each label;
# the original "\H" was an invalid escape and printed a literal backslash.
print("\nHeadline:")
print(usable_data["test"][0]["Headline"])

print("\nSummary:")
print(usable_data["test"][0]["Short"])

Split lengths: [44083, 11021]
Features: ['Headline', 'Short', 'id']
\Headline:
Trailer of Daniel Radcliffe&#39;s &#39;Imperium&#39; out

Summary:
The official trailer of the Daniel Radcliffe starrer thriller film &#39;Imperium&#39; was released on Tuesday. Directed by Daniel Ragussis, the film features Daniel as an FBI agent who goes undercover to take down a radical right-wing terrorist group. Also starring Tony Collette and Tracy Letts, the film is scheduled for a limited release on August 19. 


## Tokenization

### Inbuilt processes that take place:
i) Normalization

ii) Pre-tokenization

iii) Tokenization

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        example_batch: Mapping with a 'Headline' entry (source texts) and a
            'Short' entry (target texts), each a list of strings.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the tokenized
        'Headline' texts, and 'labels' holding the token ids of the 'Short'
        texts.
    """
    # Source side: truncated to at most 1024 tokens.
    input_encodings = tokenizer(example_batch['Headline'], max_length=1024, truncation=True)

    # Target side: `text_target` replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    target_encodings = tokenizer(text_target=example_batch['Short'], max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }

# Tokenize both splits; batched=True hands the function lists of examples per call.
custom_data = usable_data.map(
    function=convert_examples_to_features,
    batched=True,
)

Map:   0%|          | 0/44083 [00:00<?, ? examples/s]



Map:   0%|          | 0/11021 [00:00<?, ? examples/s]

## Dynamic padding of input tokens and labels using DataCollatorForSeq2Seq

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that pads inputs and labels per batch; it is handed both the
# tokenizer and the model being fine-tuned.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model_pegasus,
)

## TrainingArguments is used to define and customize the training arguments for fine-tuning of a transformer model

In [None]:
from transformers import TrainingArguments, Trainer

# Hyper-parameters for the single-epoch fine-tuning run.
trainer_args = TrainingArguments(
    output_dir='pegasus-news',
    num_train_epochs=1,
    warmup_steps=500,
    # 8 examples per device x 4 accumulation steps = 32 examples per optimizer step.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate on the held-out split every 500 optimizer steps.
    evaluation_strategy='steps',
    eval_steps=500,
    # Step counts are integers, so use an int literal instead of the float 1e6.
    # The value is far larger than the run length, so no intermediate
    # checkpoints are expected to be written.
    save_steps=1_000_000,
)

## Trainer is used to facilitate the training process of transformers
## (which, in our case, is to fine-tune the Pegasus transformer)

In [None]:
# Bundle the model, training arguments, tokenizer, collator and both tokenized
# splits into a single Trainer instance.
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=custom_data["train"],
    eval_dataset=custom_data["test"],
)

In [None]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
500,2.9618,2.690811
1000,2.793,2.569863


TrainOutput(global_step=1377, training_loss=2.977160285500919, metrics={'train_runtime': 3788.8962, 'train_samples_per_second': 11.635, 'train_steps_per_second': 0.363, 'total_flos': 1961178823262208.0, 'train_loss': 2.977160285500919, 'epoch': 1.0})

## Saving the fine-tuned model in our local directory

In [None]:
# Write the fine-tuned model to a local directory; the same path is uploaded to the Hub later.
trainer.save_model("/content/mymodel")

In [None]:
# huggingface_hub provides the `huggingface-cli` tool used in the following cells.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install huggingface_hub




## Logging into our Huggingface account

In [None]:
# Interactive login: prompts for a Hugging Face access token and saves it for later CLI calls.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


## Creating a repository to store our model

In [None]:
# Create the Hub repository that will hold the fine-tuned model (asks for confirmation).
!huggingface-cli repo create snp_model

[90mgit version 2.34.1[0m
[90mgit-lfs/3.0.2 (GitHub; linux amd64; go 1.18.1)[0m

You are about to create [1mHeavycoder/snp_model[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/Heavycoder/snp_model[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/Heavycoder/snp_model



## Uploading our fine-tuned model to the huggingface repository

In [None]:
# Push the contents of the local model directory to the Heavycoder/snp_model model repository.
!huggingface-cli upload --repo-type model Heavycoder/snp_model '/content/mymodel'

spiece.model:   0% 0.00/1.91M [00:00<?, ?B/s]
pytorch_model.bin:   0% 0.00/2.28G [00:00<?, ?B/s][A

Upload 3 LFS files:   0% 0/3 [00:00<?, ?it/s][A[A


training_args.bin:   0% 0.00/4.47k [00:00<?, ?B/s][A[A[A
spiece.model:   1% 16.4k/1.91M [00:00<00:37, 50.2kB/s]


training_args.bin: 100% 4.47k/4.47k [00:00<00:00, 11.4kB/s]

spiece.model: 100% 1.91M/1.91M [00:00<00:00, 3.19MB/s]

pytorch_model.bin:   1% 16.0M/2.28G [00:00<01:46, 21.2MB/s][A
pytorch_model.bin:   1% 24.9M/2.28G [00:00<01:09, 32.4MB/s][A
pytorch_model.bin:   1% 32.0M/2.28G [00:01<01:45, 21.4MB/s][A
pytorch_model.bin:   2% 41.4M/2.28G [00:01<01:13, 30.4MB/s][A
pytorch_model.bin:   2% 48.0M/2.28G [00:02<01:28, 25.3MB/s][A
pytorch_model.bin:   3% 57.3M/2.28G [00:02<01:05, 33.9MB/s][A
pytorch_model.bin:   3% 64.0M/2.28G [00:02<01:12, 30.8MB/s][A
pytorch_model.bin:   3% 74.2M/2.28G [00:02<00:53, 41.3MB/s][A
pytorch_model.bin:   4% 80.3M/2.28G [00:02<01:03, 34.6MB/s][A
pytorch_model.bin:   4% 94.7M/2.28G [00:0

## Testing our fine-tuned model

In [None]:
# Sample news article used as the input when trying out the fine-tuned summarizer.
input_text = """Two workers at the tsunami-wrecked Fukushima Daiichi nuclear power plant were hospitalized after accidentally getting sprayed with liquid laced with radioactive materials, officials said Thursday.
The incident occurred on Wednesday when a group of workers was cleaning the piping at the Advanced Liquid Processing System. The ALPS is a wastewater filtering facility that is key to the treatment of the radioactive wastewater that accumulates on the plant and its ongoing discharge into the sea.
Four workers were cleaning the piping when a drainage hose suddenly came off. They were splashed with the tainted liquid waste, which was not the wastewater running inside the system.
All four were wearing full face masks, and test results showed none of them had ingested radioactive particles. None have shown any health issues, according to plant operator Tokyo Electric Power Company Holdings, or TEPCO.
A fifth worker, who was also assigned to the cleaning work, was temporarily away when the accident occurred.
TEPCO began the controversial wastewater discharges on Aug. 24 from Fukushima Daiichi, which suffered triple meltdowns following the 2011 quake and tsunami. The discharges, which are expected to continue for decades, have been strongly opposed by fishing groups and neighboring countries, including China, which immediately banned imports of all Japanese seafood.
TEPCO has since completed the first two rounds of discharges as planned, and is preparing for a third, beginning in early November. Junichi Matsumoto, a TEPCO executive in charge of the treated waster discharge, told reporters that Wednesday's accident would not affect discharge plans.
Following the accident, two of the four workers were able to rinse off the contamination to the levels that allowed them to leave the plant. The other two, who had the liquid soaked through their double-layer hazmat suits and underwear and could not sufficiently lower the radiation levels, had to be taken to a hospital for further decontamination and monitoring, TEPCO said.
One of the hospitalized workers, in his 20s, was found to have exposures on the whole body except for his face, while the other man, in his 40s, had exposures in the stomach area. Risks for them to get skin burns from the radiation exposure were extremely low, TEPCO said, quoting a doctor who had examined the two workers.
"""

## Importing our model from the huggingface repository

### Username: Heavycoder
### Fine-tuned model name: snp_model

In [None]:
from transformers import pipeline

# Build a summarization pipeline backed by the fine-tuned model on the Hub.
summarizer = pipeline(
    task="summarization",
    model="Heavycoder/snp_model",
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/275 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/20.1k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/6.60M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

## Generating summary

In [None]:
# Summarize the sample article; the result is indexed below as a list of dicts.
summary = summarizer(input_text)

## Generated summary by our fine-tuned model

In [None]:
# Pull the generated text out of the first result.
# NOTE(review): the "<n>" markers in the output are presumably Pegasus's newline token - confirm before post-processing.
summary[0]['summary_text']

'Two workers at the Fukushima Daiichi nuclear power plant were hospitalized after they were accidentally sprayed with radioactive liquid .<n>The incident occurred on Wednesday while they were cleaning the piping of the Advanced Liquid Processing System .<n>None of the workers have shown any health issues, the plant operator TEPCO said .'