# 0. Install dependencies

In [2]:
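# Install PyTorch first (no install command is run in this cell).
# Pick the wheel matching your platform from the LTS 1.8 index:
# https://download.pytorch.org/whl/lts/1.8/torch_lts.html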

In [3]:
# Install transformers
!pip install transformers



# 1. Import and Load Model

In [4]:
# Importing dependencies from transformers
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [5]:
!pip install sentencepiece



In [6]:
# Instantiate the tokenizer from the "google/pegasus-xsum" pretrained checkpoint
tokenizer = PegasusTokenizer.from_pretrained(
    "google/pegasus-xsum",
)

In [7]:
# Instantiate the conditional-generation model from the "google/pegasus-xsum"
# pretrained checkpoint
model = PegasusForConditionalGeneration.from_pretrained(
    "google/pegasus-xsum",
)

# 2. Perform Abstractive Summarization

In [8]:
# Input passage to summarize; it is handed to the tokenizer in a later cell.
text = """
The X-ray, taken ahead of a forthcoming exhibition, did not catch much of the peasant woman. Instead, it picked up on the “lead white, the much heavier pigment he used for his face,” Stevenson adds.

Van Gogh reused canvases on many occasions to save money, typically opting to paint on the back side of a canvas rather than paint over an existing work. The self-portrait was most likely lost after Jo Bonger, his sister-in-law, lent the canvas to an exhibition in Amsterdam in 1905. Curators at that exhibition probably deemed the self-portrait as less complete, choosing to glue cardboard to the portrait side in order to secure the canvas before framing, per the statement.

The work is likely part of a series of experimental self-portraits van Gogh made in the summer of 1887, a few years after he painted the peasant lady, according to the museum. Five other self-portraits he painted in Nuen—the Dutch town he lived in for two years—on the backs of earlier works are on display at the Van Gogh Museum in the Netherlands.

Thanks to a specially made lightbox, an X-ray of the self-portrait will be on view at the museum’s coming exhibition, “A Taste for Impressionism.”"""

In [9]:
# Encode the passage into token ids (the numeric form of the text),
# truncating over-long input and returning PyTorch tensors
tokens = tokenizer(
    text,
    return_tensors="pt",
    padding="longest",
    truncation=True,
)

In [10]:
# Input tokens: display the tokenizer's output for `text`
# (the output shows an 'input_ids' tensor of token ids)
tokens

{'input_ids': tensor([[  139,  1346,   121,  5654,   108,   784,  1573,   113,   114, 12480,
          3388,   108,   368,   146,  2602,   249,   113,   109, 52422,  1590,
           107,  3054,   108,   126,  3093,   164,   124,   109,   185, 21248,
           695,   108,   109,   249, 11959, 20362,   178,   263,   118,   169,
           749,   483, 30934,  3488,   107,  4247, 38622, 28495, 41892,   124,
           223,  5430,   112,   803,   408,   108,  2222, 18208,   112,  1999,
           124,   109,   247,   477,   113,   114,  5999,   880,   197,  1999,
           204,   142,  1385,   201,   107,   139,   813,   121, 46832,   140,
           205,   770,  1166,   244,  6513, 48662,   420,   108,   169,  3051,
           121,   386,   121,  5505,   108, 28352,   109,  5999,   112,   142,
          3388,   115,  9171,   115, 28613,   107, 42825,   116,   134,   120,
          3388,   864,  7595,   109,   813,   121, 46832,   130,   478,   573,
           108,  2832,   112,  7156, 1

In [11]:
# Summarize: unpack the tokenizer's fields as keyword arguments to
# model.generate; the result is a tensor of summary token ids
summary = model.generate(**tokens)

In [14]:
# Show, as a plain dict, the keyword arguments that `**tokens` expands to.
# NOTE(review): looks like a leftover debugging cell that was executed out of
# order — consider removing it so the notebook reads and re-runs top to bottom.
{**tokens}

{'input_ids': tensor([[  139,  1346,   121,  5654,   108,   784,  1573,   113,   114, 12480,
           3388,   108,   368,   146,  2602,   249,   113,   109, 52422,  1590,
            107,  3054,   108,   126,  3093,   164,   124,   109,   185, 21248,
            695,   108,   109,   249, 11959, 20362,   178,   263,   118,   169,
            749,   483, 30934,  3488,   107,  4247, 38622, 28495, 41892,   124,
            223,  5430,   112,   803,   408,   108,  2222, 18208,   112,  1999,
            124,   109,   247,   477,   113,   114,  5999,   880,   197,  1999,
            204,   142,  1385,   201,   107,   139,   813,   121, 46832,   140,
            205,   770,  1166,   244,  6513, 48662,   420,   108,   169,  3051,
            121,   386,   121,  5505,   108, 28352,   109,  5999,   112,   142,
           3388,   115,  9171,   115, 28613,   107, 42825,   116,   134,   120,
           3388,   864,  7595,   109,   813,   121, 46832,   130,   478,   573,
            108,  2832,   1

In [15]:
# Summary as token ids: the full tensor returned by generate
# (the output shows a single row, i.e. one sequence)
summary

tensor([[    0,   202,   813,   121, 46832,  4207,   141, 12389,  4406, 38622,
           154,   197,   114,  1902,   754,   148,   174,  3087,   141,   142,
          1346,   121,  5654,   108,   992,   112,   109,  4247, 38622,  2447,
           115,  9171,   107,     1]])

In [12]:
# Output summary tokens: select the first (and here only) sequence of ids
summary[0]

tensor([    0,   202,   813,   121, 46832,  4207,   141, 12389,  4406, 38622,
          154,   197,   114,  1902,   754,   148,   174,  3087,   141,   142,
         1346,   121,  5654,   108,   992,   112,   109,  4247, 38622,  2447,
          115,  9171,   107,     1])

In [13]:
# Decode the first generated sequence back into text.
# The sequence starts with id 0 and ends with id 1 (special tokens); skip them
# explicitly so the result is plain text regardless of the transformers
# version, instead of possibly containing markers such as <pad> or </s>.
tokenizer.decode(summary[0], skip_special_tokens=True)

'A self-portrait painted by Vincent van Gogh more than a century ago has been identified by an X-ray, according to the Van Gogh Museum in Amsterdam.'

In [None]:
# creating a pickle file
import pickle
# Opens 'summarizer.pkl' for binary writing, which creates the file or
# truncates an existing one.
# NOTE(review): nothing visible here writes to or closes this handle — confirm
# a later pickle.dump(...) / pickle_out.close() exists, or switch to a
# `with open(...)` block so the file is always closed.
pickle_out = open('summarizer.pkl','wb')
