## Pegasus (CNN/DailyMail): zero-shot summarization

### Install Libraries

In [None]:
!pip install --upgrade transformers
!pip install datasets
!pip install rouge_score
!pip install rouge
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 39.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


### Loading the dataset

In [None]:
from google.colab import drive
from datasets import load_dataset, load_metric, Dataset
import pandas as pd

# Mount Google Drive so the processed review CSV is reachable from the runtime.
drive.mount('/content/drive')
path = "/content/drive/MyDrive/NN/amazon_review_dataset_processed.csv"

# Read the CSV and keep only the first 2500 characters of each review text.
df = pd.read_csv(path).assign(
    reviewText=lambda frame: frame['reviewText'].str[:2500]
)

# Wrap the frame in a `datasets.Dataset` and show its (rows, columns) shape.
amazon = Dataset.from_pandas(df)
amazon.shape

Mounted at /content/drive


(11848, 3)

In [None]:
import torch

# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
# (The fallback branch previously also selected "cuda", which breaks on
# CPU-only runtimes as soon as something is moved to the device.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Train-validation-test split

In [None]:
# First split: hold out 10% of the data as the test set (fixed seed).
NotTest_Test = amazon.train_test_split(test_size=0.1, seed=42)
NotTest, test = NotTest_Test["train"], NotTest_Test["test"]

# Second split: hold out 10% of the remaining rows as the validation set.
Train_Val = NotTest.train_test_split(test_size=0.1, seed=42)
train, val = Train_Val["train"], Train_Val["test"]

# Report the (rows, columns) shape of each split.
print(train.shape, val.shape, test.shape)

(9596, 3) (1067, 3) (1185, 3)


### Loading the Pegasus-CNN Dailymail model

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import numpy as np
import torch
# Checkpoint to load from the Hugging Face hub.
model_name = "google/pegasus-cnn_dailymail"

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the model, then move the model to the chosen device.
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

### Generating the summaries

In [None]:
# generate() below samples (do_sample=True); seed torch so the summaries and
# the ROUGE scores computed from them are repeatable across runs.
torch.manual_seed(42)

# Read the first 200 test reviews once, instead of indexing
# test['reviewText'] again on every loop iteration.
reviews = test['reviewText'][:200]

# One single-element list of decoded text per review.
all_outputs = []
for i in range(len(reviews)):
  # One review per batch; truncation=True clips any input that would exceed
  # the tokenizer's maximum model input length.
  batch = tokenizer(reviews[i:i+1], truncation=True, return_tensors="pt").to(device)
  # Sampled beam search with 2 beams, producing between 2 and 25 tokens.
  translated = model.generate(**batch, num_beams = 2, max_length=25, min_length=2, do_sample=True)
  tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
  all_outputs.append(tgt_text)

### Evaluating using Rouge scores

In [None]:
# Score the generated summaries against the reference summaries with ROUGE.
metric = load_metric('rouge')

# all_outputs holds one single-element list per review; flatten to 1-D.
flattened_outputs = np.array(all_outputs).flatten()

# The generated text contains the literal marker "<n>" between sentences.
# Turn it into a real newline for scoring so it is not counted as a token
# (and so rougeLsum can split on sentence boundaries).
rouge_predictions = [text.replace("<n>", "\n") for text in flattened_outputs]

# predictions = model output, references = gold summaries. These two were
# previously passed the other way round, which swaps precision and recall.
values = metric.compute(predictions=rouge_predictions, references=test['summary'][:200], use_stemmer=True)
values

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

{'rouge1': AggregateScore(low=Score(precision=0.17032602038215922, recall=0.1060827676842561, fmeasure=0.12936969780945934), mid=Score(precision=0.19804070008723645, recall=0.12318763629927221, fmeasure=0.14894389657047435), high=Score(precision=0.22588076399381068, recall=0.13948729069196508, fmeasure=0.16849677850234474)),
 'rouge2': AggregateScore(low=Score(precision=0.053922614068937606, recall=0.03189579186027436, fmeasure=0.03924985742130454), mid=Score(precision=0.07443821109446108, recall=0.0438991053913444, fmeasure=0.054056516059266285), high=Score(precision=0.0982859813797314, recall=0.057069976697209264, fmeasure=0.0699001991837544)),
 'rougeL': AggregateScore(low=Score(precision=0.14317200000990143, recall=0.08927571073535237, fmeasure=0.10858671255647809), mid=Score(precision=0.1682557947083876, recall=0.10433923799032654, fmeasure=0.126259372259735), high=Score(precision=0.1958666664363114, recall=0.12133432208367312, fmeasure=0.14673690682286933)),
 'rougeLsum': Aggrega

### Printing the generated summaries

In [None]:
# Show the array of generated summaries using its default text rendering.
generated_repr = str(flattened_outputs)
print(generated_repr)

['Average write super annoy heroine hot sex totally abrupt end book need come one allergy warn.<n>Cole allergy warn aside'
 'cpr v202 should include option for block phone number.<n>Caller id signal particularly weak might use splitter'
 'theres katherine garbera eve gaddy whiskey river book 1 theres sexy attorney ryder for'
 'otmar suitner follow somewhat cult conductor yet grossly unfair one busiest active musicians career.<n>Music director 1964 1990'
 'Young pervert tomoki sakurai constantly mind one girl.<n>Hiyori kazane girl desperately'
 '6d new approach full frame camera probably forward-thinking production.<n>6d cameras are smaller 20mp and'
 "long run anime series find wonderful  inuyasha '' '' one special title call total package cool character cool storyline"
 'although procol harum always sell well come albums hit single limit three u.s. two u.k.'
 'Eye-fi card requires internet connection pass confidential data internet various unknown entities without consent.<n>Eye-fi ca

### Printing the actual summaries

In [None]:
# Show the reference summaries for the same 200 test rows that were summarized.
reference_summaries = test['summary'][:200]
print(reference_summaries)

["decent write problem annoy heroine 's abrupt end might want pass one", 'well ... far good us use verizon fios single family home dect phone', 'shes determine go away present ryder let go easily ryder ford addi', '4.5 star underrate performance surprise delight throughout -minor vocal issue prevent full 5 star', 'entertain film fan watch previous series full fan-service', 'canon 6d bring modern tech classic shoot technique way keep photographer involve', "eight year hiatus original anime series `` inuyasha '' back first half final act", "`` salty dog '' allow procol harum sail uncharted waters-the salvo edition sound extremely good", 'eye.fi software steal account passwords review vote eye-fi shill read fast', 'excellent case lot nice feature auto sleep/wake function price right', 'author carly phillips erika wilde proud wonderful debut new series really great', 'great fun intelligent well-read creative type much lazy parent people hat english light class', 'astak ultrafast battery ch