In [1]:
import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline

In [2]:
text = '''WaiPRACTICE is a subset of WaiLEARN, our main education program. WaiPractice is a playground where learners, 
enthusiasts, and experts can collaborate and practice their skills, grow their knowledge, share best practices, 
explore new use-cases, and simultaneously grow their networks in the domain of #AI. This program also provides 
access to mentorship opportunities, networking and community building while ensuring continous establishment of 
individual online portfolios.'''

In [3]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [4]:
inputs = tokenizer.batch_encode_plus([text], max_length=1024, return_tensors='pt')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [5]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [6]:
inputs['input_ids'][0]

tensor([    0,   771,  1439,  4454, 13709,  9292,    16,    10, 37105,     9,
          305,  1439,  3850,  2747,   487,     6,    84,  1049,  1265,   586,
            4,   305,  1439, 46029,  2463,    16,    10, 14988,   147, 25929,
            6,  1437, 50118,  1342, 25134,   118, 13651,     6,     8,  2320,
           64, 16075,     8,  1524,    49,  2417,     6,  1733,    49,  2655,
            6,   458,   275,  3464,     6,  1437, 50118, 23242,  1688,    92,
          304,    12, 28162,     6,     8, 11586,  1733,    49,  4836,    11,
            5, 11170,     9,   849, 15238,     4,   152,   586,    67,  1639,
         1437, 50118, 28300,     7, 12906, 11887,  1616,     6, 11745,     8,
          435,   745,   150,  6060, 39077,  1827,  7147,     9,  1437, 50118,
        42333,   804, 16465,     4,     2])

In [7]:
inputs['attention_mask'][0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1])

In [8]:
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=False)

In [9]:
type(summary_ids)

torch.Tensor

In [10]:
summary_ids

tensor([[    2,     0,   771,  1439,  4454, 13709,  9292,    16,    10, 37105,
             9,   305,  1439,  3850,  2747,   487,     6,    84,  1049,  1265,
           586,     4,   305,  1439, 46029,  2463,    16,    10, 14988,   147,
         25929,     6,  1437,  1342, 25134,   118, 13651,     6,     8,  2320,
            64, 16075,     8,  1524,    49,  2417,     6,  1733,    49,  2655,
             6,   458,   275,  3464,     6,  1437, 23242,  1688,    92,   304,
            12, 28162,     6,     8, 11586,  1733,    49,  4836,    11,     5,
         11170,     9,   849, 15238,     4,   152,   586,    67,  1639,  1437,
         28300,     7, 12906, 11887,  1616,     6, 11745,     8,   435,   745,
           150,  6060, 39077,  1827,  7147,     9,  1736,   804, 16465,     2]])

In [11]:
len(summary_ids[0]) # look at the config model.generate()

100

In [12]:
for ids in summary_ids:
    # decode the tensor of token ids
    short = tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(len(text), len(short))
    print(short)

477 472
WaiPRACTICE is a subset of WaiLEARN, our main education program. WaiPractice is a playground where learners, enthusiasts, and experts can collaborate and practice their skills, grow their knowledge, share best practices, explore new use-cases, and simultaneously grow their networks in the domain of #AI. This program also provides access to mentorship opportunities, networking and community building while ensuring continous establishment of individual online portfolios


In [13]:
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=20, early_stopping=False)
for ids in summary_ids:
    # decode the tensor of token ids
    short = tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(len(text), len(short))
    print(short)

477 45
WaiPRACTICE is a subset of WaiLEARN, our main


In [14]:
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=50, early_stopping=False)
for ids in summary_ids:
    # decode the tensor of token ids
    short = tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(len(text), len(short))
    print(short)

477 189
WaiPRACTICE is a subset of WaiLEARN, our main education program. WaiPractice is a playground where learners, enthusiasts, and experts can collaborate and practice their skills. This program
