# Imports

In [1]:
from transformers import pipeline
from datasets import load_dataset

from transformers import BartTokenizer
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from transformers import T5Tokenizer
from transformers import GPT2Tokenizer
from transformers import ElectraTokenizer
from transformers import PegasusTokenizer
from transformers import AutoTokenizer

import torch

  from .autonotebook import tqdm as notebook_tqdm


# Custom Tokenizers

In [2]:
t = AutoTokenizer.from_pretrained('gpt2', padding_side='left')
t.pad_token = t.eos_token
# t.padding_side
# t('a fhfhs fa sdfhas df asdhf',return_tensors='pt', padding=True)['input_ids']
t([
    'test sentence',
    'this is an even longer test sentence'
], return_tensors='pt', padding=True)

{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256,  9288,  6827],
        [ 5661,   318,   281,   772,  2392,  1332,  6827]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}

## BERT

In [3]:
class CustomBertTokenizer(BertTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'        
        kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

## Roberta

In [4]:
class CustomRobertaTokenizer(RobertaTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'
        kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

## BART

In [52]:
# TEST

class CustomTokenizer():
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, model_name):
        """
        """
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            padding_side='left',
        )

    def __call__(self, *args, **kwargs):
        """
        """
        kwargs["padding"] = True,
        kwargs["return_tensors"] = 'pt',
        kwargs["truncation"] = True,


        default_tokenization = self.tokenizer(
            *args, **kwargs
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.tokenizer.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[self.tokenizer.pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [54]:
ct = CustomTokenizer('facebook/bart-large')
ct.num_pads = 2
ct([
    'this is a sentence',
    'Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has'
])

Max length before padding: 20.


{'input_ids': tensor([[    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     0,  9226,    16,    10,
           3645,     2],
         [    1,     1,     0, 29345, 10997,   999,  3028,  7312, 20152,  4072,
            504,    15,   475, 46328,   479,     5,   664,  2701,   161,    37,
             34,     2]]),
 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [43]:
class CustomBartTokenizer(AutoTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'
        kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [46]:
ct = CustomBartTokenizer.from_pretrained('facebook/bart-large', use_fast=False)
print(ct)
ct.num_pads = 2
# ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
# ct([
#     'this is a sentence',
#     'Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has'
# ])

BartTokenizer(name_or_path='facebook/bart-large', vocab_size=50265, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)


## GPT2

In [7]:
class CustomGPT2Tokenizer(GPT2Tokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        self.pad_token  = self.eos_token
        
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'
        kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        

        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [8]:
ct = CustomGPT2Tokenizer.from_pretrained('gpt2')
ct.num_pads = 2
# ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'this is a sentence',
    'Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has'
])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'CustomGPT2Tokenizer'.


Max length before padding: 18.


{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256,  5661,   318,   257,  6827],
         [50256, 50256, 18308, 14179,  3491,  7806,  5325, 33783,  4962,  1248,
            319,   285,  3204,   764,   262,  1862,  8674,  1139,   339,   468]]),
 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## T5

In [9]:
class CustomT5Tokenizer(T5Tokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'
        kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [10]:
ct = CustomT5Tokenizer.from_pretrained('T5-small')
ct.num_pads = 2
ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'this is a sentence',
    'Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has'
])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'CustomT5Tokenizer'.


Max length before padding: 22.
Max length before padding: 22.


{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,    48,    19,
              3,     9,  7142,     1],
         [    0,     0,  8929, 16023,  2213,  4173,  6324, 12591,    15,  5050,
            507,    30,  1911,  1135,     3,     5,     8,  1021,  7556,   845,
              3,    88,    65,     1]]),
 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## Electra

In [11]:
class CustomElectraTokenizer(ElectraTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'
        kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [12]:
ct = CustomElectraTokenizer.from_pretrained('google/electra-small-discriminator')
ct.num_pads = 2
ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'this is a sentence',
    'Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has'
])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'CustomElectraTokenizer'.


Max length before padding: 18.
Max length before padding: 18.


{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   101,  2023,  2003,  1037,  6251,   102],
         [    0,     0,   101,  4302, 10693,  2732,  3817, 22603,  4332,  2324,
           2006,  6928,  1012,  1996,  2402,  3364,  2758,  2002,  2038,   102]]),
 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## PEGASUS

In [13]:
class CustomPegasusTokenizer(PegasusTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        kwargs['padding'] = True
        kwargs['return_tensors'] = 'pt'
        kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]
        print(f'Max length before padding: {input_ids.shape[1]}.')

        # No more pads can be added
        if input_ids.shape[1] + self.num_pads > self.model_max_length:
            print(f"WARNING: Input max length with added pads is bigger than model's max length. No pads were added.")
            return default_tokenization

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size, dtype=int)
        new_pads_masks = torch.tensor([[0] * self.num_pads] * batch_size, dtype=int)

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [14]:
ct = CustomPegasusTokenizer.from_pretrained('google/pegasus-xsum')
ct.num_pads = 2
ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'this is a sentence',
    'Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has'
])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PegasusTokenizer'. 
The class this function is called from is 'CustomPegasusTokenizer'.


Max length before padding: 18.
Max length before padding: 18.


{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,   136,   117,   114,  5577,     1],
         [    0,     0,  5849, 10173,  2187,  4767, 59988,  3043,  1204,   124,
          42993,   110,   107,   109,   758,  5102,   649,   178,   148,     1]]),
 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
         [0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

# Datasets

## Summarization

### CNN Daily Mail

In [15]:
cnn_dailymail_ds = load_dataset('cnn_dailymail', '3.0.0', split='train')

Found cached dataset cnn_dailymail (/nfs/home/marquez/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


### XSum

In [16]:
xsum_ds = load_dataset('xsum', split='train')

Found cached dataset xsum (/nfs/home/marquez/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71)


In [36]:
xsum_ds

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 204045
})

### GEM - Wikilingua

In [17]:
gem_wikilingua_en_ds = load_dataset('GEM/wiki_lingua', 'en', split='train')

Found cached dataset wiki_lingua (/nfs/home/marquez/.cache/huggingface/datasets/GEM___wiki_lingua/en/2.0.0/84e1fa083237de0bf0016a1934d8b659ecafd567f398012ca5d702b7acc97450)


In [38]:
gem_wikilingua_en_ds[0]

{'gem_id': 'wikilingua_multilingual-train-424377',
 'gem_parent_id': 'wikilingua_multilingual-train-424377',
 'source_language': 'en',
 'target_language': 'en',
 'source': 'Honesty is usually the best policy. It is disrespectful to lie to someone. If you don\'t want to date someone, you should say so.  Sometimes it is easy to be honest. For example, you might be able to truthfully say, "No, thank you, I already have a date for that party." Other times, you might need to find a kinder way to be nice. Maybe you are not attracted to the person. Instead of bluntly saying that, try saying, "No, thank you, I just don\'t think we would be a good fit." Avoid making up a phony excuse. For instance, don\'t tell someone you will be out of town this weekend if you won\'t be. There\'s a chance that you might then run into them at the movies, which would definitely cause hurt feelings. A compliment sandwich is a really effective way to provide feedback. Essentially, you "sandwich" your negative comm

## Classification

In [39]:
imdb_ds = load_dataset("imdb", split="train")



In [40]:
imdb_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

# Pipelines

## Summarization

### CNN Daily Mail

#### GPT2

In [113]:
model_name = "gavin124/gpt2-finetuned-cnn-summarization-v2"
custom_tokenizer = CustomGPT2Tokenizer.from_pretrained(model_name)
custom_tokenizer.num_pads = 0


summarizer = pipeline(
    "summarization", 
    model=model_name, 
    tokenizer=custom_tokenizer, 
    framework="pt"
)

summarizer(cnn_dailymail_ds[:10]['article'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'CustomGPT2Tokenizer'.
The model 'GPT2LMHeadModel' is not supported for summarization. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForCond

Max length before padding: 563.
Max length before padding: 886.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 917, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 529, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Max length before padding: 917.
Max length before padding: 529.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 1024, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 884, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Max length before padding: 1024.
Max length before padding: 884.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 1024, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 452, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Max length before padding: 1024.
Max length before padding: 452.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 642, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 464, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Max length before padding: 642.
Max length before padding: 464.


[{'summary_text': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box offic

#### BART

In [55]:
model_name = "facebook/bart-large"
custom_tokenizer = CustomTokenizer(model_name)
custom_tokenizer.num_pads = 0


summarizer = pipeline(
    "summarization", 
    model=model_name, 
    tokenizer=custom_tokenizer, 
    framework="pt"
)

summarizer(cnn_dailymail_ds[0]['article'])

TypeError: CustomTokenizer.__call__() got an unexpected keyword argument 'padding'

In [18]:
model_name = "facebook/bart-large"
custom_tokenizer = CustomBartTokenizer.from_pretrained(model_name)
custom_tokenizer.num_pads = 0


summarizer = pipeline(
    "summarization", 
    model=model_name, 
    tokenizer=custom_tokenizer, 
    framework="pt"
)

summarizer(cnn_dailymail_ds[:10]['article'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'CustomBartTokenizer'.


Max length before padding: 565.
Max length before padding: 888.
Max length before padding: 919.
Max length before padding: 531.
Max length before padding: 1024.
Max length before padding: 886.
Max length before padding: 1024.
Max length before padding: 454.
Max length before padding: 644.
Max length before padding: 466.


[{'summary_text': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Despite his growing fame and riches, the actor says he is keeping his feet firmly on the ground. "People are always looking to say \'kid star goes off the rails,\'" he told reporters last month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of'},
 {'summary_text': 'Editor\'s note: In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes us inside a jail where many of the inmates are mentally ill. MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illne

#### PEGASUS

In [115]:
# from transformers import PegasusForConditionalGeneration, PegasusConfig

model_name = "google/pegasus-large"
custom_tokenizer = CustomPegasusTokenizer.from_pretrained(model_name)
custom_tokenizer.num_pads = 0

# config = PegasusConfig()
# config.vocab_size = custom_tokenizer.vocab_size
# model = PegasusForConditionalGeneration.from_pretrained(model_name, config=config)

# print(model.vocab_size)
summarizer = pipeline(
    "summarization", 
    model=model_name, 
    tokenizer=custom_tokenizer, 
    framework="pt"
)


summarizer(cnn_dailymail_ds[:10]['article'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PegasusTokenizer'. 
The class this function is called from is 'CustomPegasusTokenizer'.


Max length before padding: 560.
Max length before padding: 875.
Max length before padding: 914.
Max length before padding: 516.
Max length before padding: 1024.
Max length before padding: 898.
Max length before padding: 1024.
Max length before padding: 442.
Max length before padding: 642.
Max length before padding: 455.


[{'summary_text': 'Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "But I try very hard not to go that way because it would be too easy for them." His latest outing as the boy wizard in "Harry Potter and the Order of the Phoenix" is breaking records on both sides of the Atlantic and he will reprise the role in the last two films.'},
 {'summary_text': 'MIAMI, Florida (CNN) -- The ninth floor of the Miami-Dade pretrial detention facility is dubbed the "forgotten floor." Here, inmates with the most severe mental illnesses are incarcerated until they\'re ready to appear in court. Most often, the

### XSum

#### PEGASUS

In [98]:
model_name = "google/pegasus-xsum"
custom_tokenizer = CustomPegasusTokenizer.from_pretrained(model_name)
custom_tokenizer.num_pads = 0

summarizer = pipeline(
    task="summarization",
    model=model_name,
    tokenizer=custom_tokenizer,
    framework="pt"
)

summarizer(xsum_ds[:10]['document'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PegasusTokenizer'. 
The class this function is called from is 'CustomPegasusTokenizer'.


Max length before padding: 475.
Max length before padding: 178.
Max length before padding: 512.
Max length before padding: 305.
Max length before padding: 208.
Max length before padding: 512.
Max length before padding: 406.
Max length before padding: 423.
Max length before padding: 496.
Max length before padding: 123.


[{'summary_text': 'A clean-up operation is under way in parts of Dumfries and Galloway hit by flooding over the weekend.'},
 {'summary_text': 'Two tourist buses have been destroyed in a suspected arson attack in Londonderry.'},
 {'summary_text': 'Lewis Hamilton beat Mercedes team-mate Nico Rosberg to pole position at the Bahrain Grand Prix.'},
 {'summary_text': 'A former Lincolnshire Police officer has gone on trial accused of sexually abusing boys in the 1970s and 80s.'},
 {'summary_text': 'Turkish police have ended a siege at a psychiatric hospital in Istanbul.'},
 {'summary_text': 'Glasgow Warriors made it two wins out of two in the Pro12 with a bonus-point victory over the Dragons at Scotstoun.'},
 {'summary_text': 'A man police want to trace in connection with a fatal hit-and-run crash in south London has been identified.'},
 {'summary_text': 'Welsh cyclist Luke Rowe has called for a speed limit to be put in place following the death of Pierre Demoitie.'},
 {'summary_text': 'Manch

#### BART

In [102]:
from transformers import BartForConditionalGeneration, BartConfig

model_name = "facebook/bart-large-xsum"
custom_tokenizer = CustomBartTokenizer.from_pretrained(model_name)
custom_tokenizer.num_pads = 10

# config = BartConfig()
# config.vocab_size = custom_tokenizer.vocab_size
# model = BartForConditionalGeneration.from_pretrained(model_name, config=config)

summarizer = pipeline(
    "summarization", 
    model=model_name, 
    tokenizer=custom_tokenizer, 
    framework="pt"
)

summarizer(xsum_ds[:10]['document'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'CustomBartTokenizer'.


Max length before padding: 516.
Max length before padding: 186.
Max length before padding: 1024.
Max length before padding: 329.
Max length before padding: 220.
Max length before padding: 815.
Max length before padding: 434.
Max length before padding: 444.
Max length before padding: 523.
Max length before padding: 135.


[{'summary_text': 'The impact of flooding in Dumfries and Galloway and the Borders is continuing to be felt.'},
 {'summary_text': 'Two tour buses have been destroyed in a suspected arson attack in Londonderry.'},
 {'summary_text': 'Nico Rosberg will start the Bahrain Grand Prix from pole position after a close battle with Mercedes team-mate Lewis Hamilton.'},
 {'summary_text': 'A former police and scout leader sexually abused two boys, a court has heard.'},
 {'summary_text': 'An armed man has been arrested at a psychiatric hospital in the Turkish city of Istanbul, police say.'},
 {'summary_text': 'Glasgow Warriors scored four second-half tries to beat Newport Gwent Dragons in the Pro12 at Scotstoun.'},
 {'summary_text': 'A man wanted in connection with a "horrific" car crash which killed a woman at a bus stop in south-east London has been named by police.'},
 {'summary_text': 'Welsh cyclist Luke Rowe has called for a limit on the speed at which motorbikes can overtake riders following 

In [33]:
cnn_dailymail_ds_sample = cnn_dailymail_ds.train_test_split(test_size=0.0001)['test']
cnn_dailymail_ds_sample

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 29
})

In [34]:
from evaluate import SummarizationEvaluator

eval = SummarizationEvaluator()
eval.compute(
    model_or_pipeline=summarizer,
    data=cnn_dailymail_ds_sample,
    metric='rouge',
    input_column='article',
    label_column='highlights',
)
# eval(cnn_dailymail_ds[:10])

Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 660.
Max length before padding: 538.
Max length before padding: 480.
Max length before padding: 453.
Max length before padding: 611.
Max length before padding: 609.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 978.
Max length before padding: 329.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 999.
Max length before padding: 672.
Max length before padding: 1024.
Max length before padding: 937.
Max length before padding: 751.
Max length before padding: 1024.
Max length before padding: 961.
Max length before padding: 1024.
Max length before padding: 428.
Max length before padding: 888.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 335.
Max length before padding: 369.


{'rouge1': 0.28023202824862414,
 'rouge2': 0.09006984889192274,
 'rougeL': 0.16835324125413131,
 'rougeLsum': 0.2249814018441693,
 'total_time_in_seconds': 86.9716996671632,
 'samples_per_second': 0.3334417990102723,
 'latency_in_seconds': 2.9990241264539033}

In [35]:
summarizer.tokenizer.num_pads = 50
eval = SummarizationEvaluator()
eval.compute(
    model_or_pipeline=summarizer,
    data=cnn_dailymail_ds_sample,
    metric='rouge',
    input_column='article',
    label_column='highlights',
)

Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 660.
Max length before padding: 538.
Max length before padding: 480.
Max length before padding: 453.
Max length before padding: 611.
Max length before padding: 609.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 978.
Max length before padding: 329.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 999.
Max length before padding: 672.
Max length before padding: 1024.
Max length before padding: 937.
Max length before padding: 751.
Max length before padding: 1024.
Max length before padding: 961.
Max length before padding: 1024.
Max length before padding: 428.
Max length before padding: 888.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 1024.
Max length before padding: 335.
Max length before padding: 369.


{'rouge1': 0.27693118799604594,
 'rouge2': 0.08866997214479702,
 'rougeL': 0.16583266535656624,
 'rougeLsum': 0.2201756834773786,
 'total_time_in_seconds': 87.04808138916269,
 'samples_per_second': 0.3331492152061429,
 'latency_in_seconds': 3.0016579789366444}