## 0. Install dependencies

In [1]:
!pip install torch==1.8.2 torchvision==0.9.2 torchaudio===0.8.2 --extra-index-url https://download.pytorch.org/whl/lts/1.8/cpu

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/lts/1.8/cpu


In [2]:
# Install transformers
!pip install transformers



# Import and Load Model

In [3]:
# Importing dependencies from transformers
from transformers import PegasusForConditionalGeneration
# from transformers import PegasusTokenizer
from transformers import AutoTokenizer

import torch
# PegasusTokenizer requires the SentencePiece library

# PegasusTokenizer
# tokenizer converts the sentences into a set of tokens
# number representation of our sentences rather than passing through the words

In [4]:
!pip install sentencepiece



In [5]:
# Load model 
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

In [8]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [9]:
!pip install transformers



In [10]:
# Load tokenizer 
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

# Perform Abstractive Summarization

In [12]:
!pip install goose3
from goose3 import Goose



In [13]:
g = Goose() # Goose is the class which will be used to extract the text from the url.
url = 'https://www.sciencedaily.com/releases/2021/08/210811162816.htm'
article = g.extract(url)

In [14]:
article.title

'Global warming begets more warming, new paleoclimate study finds'

In [15]:
article.infos

{'meta': {'description': "Global warming begets more, extreme warming, new paleoclimate study finds. Researchers observe a 'warming bias' over the past 66 million years that may return if ice sheets disappear.",
  'lang': 'en',
  'keywords': 'Global Warming; Climate; Environmental Issues; Earth Science; Early Climate; Fossils; Origin of Life; Evolution',
  'favicon': '',
  'canonical': 'https://www.sciencedaily.com/releases/2021/08/210811162816.htm',
  'encoding': 'utf-8'},
 'image': None,
 'domain': 'www.sciencedaily.com',
 'title': 'Global warming begets more warming, new paleoclimate study finds',
 'cleaned_text': 'It is increasingly clear that the prolonged drought conditions, record-breaking heat, sustained wildfires, and frequent, more extreme storms experienced in recent years are a direct result of rising global temperatures brought on by humans\' addition of carbon dioxide to the atmosphere. And a new MIT study on extreme climate events in Earth\'s ancient history suggests tha

In [16]:
article.cleaned_text

'It is increasingly clear that the prolonged drought conditions, record-breaking heat, sustained wildfires, and frequent, more extreme storms experienced in recent years are a direct result of rising global temperatures brought on by humans\' addition of carbon dioxide to the atmosphere. And a new MIT study on extreme climate events in Earth\'s ancient history suggests that today\'s planet may become more volatile as it continues to warm.\n\nThe study, appearing today in , examines the paleoclimate record of the last 66 million years, during the Cenozoic era, which began shortly after the extinction of the dinosaurs. The scientists found that during this period, fluctuations in the Earth\'s climate experienced a surprising "warming bias." In other words, there were far more warming events -- periods of prolonged global warming, lasting thousands to tens of thousands of years -- than cooling events. What\'s more, warming events tended to be more extreme, with greater shifts in temperatu

In [17]:
len(article.cleaned_text)

6770

In [18]:
text = article.cleaned_text

In [19]:
# Create tokens - number representation of our text
tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")

In [20]:
tokens

{'input_ids': tensor([[  168,   117,  3632,   786,   120,   109, 13488, 11945,  1047,   108,
          1093,   121, 13802,  1206,   108,  8123, 39471,   108,   111,  5030,
           108,   154,  3837, 11913,  1267,   115,   909,   231,   127,   114,
          1443,   711,   113,  4220,  1122,  4374,  1457,   124,   141,  4095,
           131,   663,   113,  3359, 13405,   112,   109,  2918,   107,   325,
           114,   177, 14842,   692,   124,  3837,  2354,   702,   115,  2774,
           131,   116,  3266,   689,  4079,   120,   380,   131,   116,  3909,
           218,   460,   154, 16434,   130,   126,  2138,   112,  1515,   107,
           139,   692,   108,  8741,   380,   115,   110,   108, 12414,   109,
         33375, 47207,  1093,   113,   109,   289,  8449,   604,   231,   108,
           333,   109,   597, 17711, 73730,  4065,   108,   162,  1219,  5683,
           244,   109, 22526,   113,   109, 23105,   107,   139,  4182,   374,
           120,   333,   136,   908,  

In [21]:
type(tokens)

transformers.tokenization_utils_base.BatchEncoding

In [22]:
# Summarize
summary = model.generate(**tokens)

In [23]:
{**tokens}

{'input_ids': tensor([[  168,   117,  3632,   786,   120,   109, 13488, 11945,  1047,   108,
           1093,   121, 13802,  1206,   108,  8123, 39471,   108,   111,  5030,
            108,   154,  3837, 11913,  1267,   115,   909,   231,   127,   114,
           1443,   711,   113,  4220,  1122,  4374,  1457,   124,   141,  4095,
            131,   663,   113,  3359, 13405,   112,   109,  2918,   107,   325,
            114,   177, 14842,   692,   124,  3837,  2354,   702,   115,  2774,
            131,   116,  3266,   689,  4079,   120,   380,   131,   116,  3909,
            218,   460,   154, 16434,   130,   126,  2138,   112,  1515,   107,
            139,   692,   108,  8741,   380,   115,   110,   108, 12414,   109,
          33375, 47207,  1093,   113,   109,   289,  8449,   604,   231,   108,
            333,   109,   597, 17711, 73730,  4065,   108,   162,  1219,  5683,
            244,   109, 22526,   113,   109, 23105,   107,   139,  4182,   374,
            120,   333,   1

In [24]:
# Summary in tokens
summary

tensor([[    0,   202,   177,   692,  4079,   120,   109,  2774,   131,   116,
          2354,   218,   460,   154, 16434,   130,   126,  2138,   112,  1515,
           107,     1]])

In [25]:
# Decode summary
tokenizer.decode(summary[0])

"<pad> A new study suggests that the Earth's climate may become more volatile as it continues to warm.</s>"