# Imports

In [1]:
from transformers import pipeline
from datasets import load_dataset

from transformers import BartTokenizer
from transformers import BertTokenizer
from transformers import RobertaTokenizer
from transformers import T5Tokenizer
from transformers import GPT2Tokenizer
from transformers import ElectraTokenizer
from transformers import PegasusTokenizer
from transformers import AutoTokenizer

import torch

  from .autonotebook import tqdm as notebook_tqdm


# Custom Tokenizers

In [None]:
t = AutoTokenizer.from_pretrained('gpt2', padding_side='left')
t.pad_token = t.eos_token
# t.padding_side
# t('a fhfhs fa sdfhas df asdhf',return_tensors='pt', padding=True)['input_ids']
t([
    'test sentence',
    'this is an even longer test sentence'
], return_tensors='pt', padding=True)

## BERT

In [28]:
class CustomBertTokenizer(BertTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        if kwargs:
            kwargs['padding'] = True
            kwargs['return_tensors'] = 'pt'        
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

## Roberta

In [29]:
class CustomRobertaTokenizer(RobertaTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        if kwargs:
            kwargs['padding'] = True
            kwargs['return_tensors'] = 'pt'
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

## BART

In [12]:
class CustomBartTokenizer(BartTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        if kwargs:
            kwargs['padding'] = True
            kwargs['return_tensors'] = 'pt'
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [39]:
ct = CustomBartTokenizer.from_pretrained('facebook/bart-base')
ct.num_pads = 2
ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'sdfas  dj j j j j js s',
    'sadjf k ds dj j  j io iu uio oiu iou ius sad'
])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'CustomBartTokenizer'.


hello
hello


{'input_ids': tensor([[    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     0,    29, 36807,   281,  1437, 29831,  1236,
           1236,  1236,  1236, 42898,   579,     2],
         [    1,     1,     0,    29, 41587,   506,   449,   385,    29, 29831,
           1236,  1437,  1236, 46155,   939,   257,  1717,  1020,  1021,  9060,
            939,  1438,   939,   687,  5074,     2]]),
 'attention_mask': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1.]])}

## GPT2

In [30]:
class CustomGPT2Tokenizer(GPT2Tokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        self.pad_token  = self.eos_token
        if kwargs:
            kwargs['padding'] = True
            kwargs['return_tensors'] = 'pt'
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        

        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [59]:
ct = CustomGPT2Tokenizer.from_pretrained('gpt2')
ct.num_pads = 2
# ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'sdfas  dj j j j j js s',
    'sadjf k ds dj j  j io iu uio oiu iou ius sad'
])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'CustomGPT2Tokenizer'.


{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256,    82,  7568,   292,   220, 42625,   474,   474,
            474,   474, 44804,   264],
         [50256, 50256,    82, 41255,    69,   479,   288,    82, 42625,   474,
            220,   474, 33245,  1312,    84,   334,   952,   267, 16115,  1312,
            280,  1312,   385,  6507]]),
 'attention_mask': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1.]])}

## T5

In [31]:
class CustomT5Tokenizer(T5Tokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        if kwargs:
            kwargs['padding'] = True
            kwargs['return_tensors'] = 'pt'
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [61]:
ct = CustomT5Tokenizer.from_pretrained('T5-small')
ct.num_pads = 2
# ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'sdfas  dj j j j j js s',
    'sadjf k ds dj j  j io iu uio oiu iou ius sad'
])

Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.96MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 4.83MB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'CustomT5Tokenizer'.


hello


{'input_ids': tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    3,    7,   26,   89,    9,    7,    3,   26,
           354,    3,  354,    3,  354,    3,  354,    3,  354,    3,  354,    7,
             3,    7,    1],
         [   0,    0, 6819,  354,   89,    3,  157,    3,   26,    7,    3,   26,
           354,    3,  354,    3,  354,    3,   23,   32,    3,   23,   76,    3,
            76,   23,   32,    3,   32,   23,   76,    3,   23, 1063,    3,   23,
           302, 6819,    1]]),
 'attention_mask': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1.]])}

## Electra

In [8]:
class CustomElectraTokenizer(ElectraTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        
        default_tokenization = super().__call__(
            *args, 
            return_tensors='pt',
            padding=True,
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [64]:
ct = CustomElectraTokenizer.from_pretrained('google/electra-small-discriminator')
ct.num_pads = 2
# ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'sdfas  dj j j j j js s',
    'sadjf k ds dj j  j io iu uio oiu iou ius sad'
])

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.16MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 61.5kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 1.42MB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'ElectraTokenizer'. 
The class this function is called from is 'CustomElectraTokenizer'.


hello


{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,   101, 17371,  7011,  2015,  6520,  1046,  1046,  1046,  1046,
           1046,  2015,  1055,   102],
         [    0,     0,   101,  6517,  3501,  2546,  1047, 16233,  6520,  1046,
           1046, 22834,  1045,  2226, 21318,  2080,  1051, 17922, 22834,  2226,
           1045,  2271,  6517,   102]]),
 'attention_mask': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1.]])}

## PEGASUS

In [46]:
class CustomPegasusTokenizer(PegasusTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        if kwargs:
            kwargs['padding'] = True
            kwargs['return_tensors'] = 'pt'
            kwargs['truncation'] = True
        
        default_tokenization = super().__call__(
            *args, 
            **kwargs,
        )
        
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [73]:
ct = CustomPegasusTokenizer.from_pretrained('google/pegasus-xsum')
ct.num_pads = 2
# ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'sdfas  dj j j j j js s',
    'sadjf k ds dj j  j io iu uio oiu iou ius sad'
])

Downloading (…)ve/main/spiece.model: 100%|██████████| 1.91M/1.91M [00:00<00:00, 3.78MB/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PegasusTokenizer'. 
The class this function is called from is 'CustomPegasusTokenizer'.


{'input_ids': tensor([[    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,   110,   116,   252, 49870, 34566,  7174,  7174,  7174,  7174,
          57716,   110,   116,     1],
         [    0,     0,  4508, 76786,  4817,  3138,   116, 34566,  7174,  7174,
            110,  4430,   110, 21994,  4911,  4430,  4429, 21994,   110, 65065,
            110, 11641,  4508,     1]]),
 'attention_mask': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1.]])}

# Dataset

## Summarization

### CNN Daily Mail

In [10]:
cnn_dailymail_ds = load_dataset('cnn_dailymail', '3.0.0', split='train')

Found cached dataset cnn_dailymail (/nfs/home/marquez/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


### XSum

In [74]:
xsum_ds = load_dataset('xsum', split='train')

Downloading builder script: 100%|██████████| 5.76k/5.76k [00:00<00:00, 7.18MB/s]
Downloading readme: 100%|██████████| 6.24k/6.24k [00:00<00:00, 8.59MB/s]


Downloading and preparing dataset xsum/default to /nfs/home/marquez/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71...


Downloading data: 100%|██████████| 255M/255M [00:02<00:00, 96.6MB/s]
Downloading data: 2.72MB [00:00, 34.4MB/s]                   .18s/it]
Downloading data files: 100%|██████████| 2/2 [00:05<00:00,  2.77s/it]
                                                                                          

Dataset xsum downloaded and prepared to /nfs/home/marquez/.cache/huggingface/datasets/xsum/default/1.2.0/082863bf4754ee058a5b6f6525d0cb2b18eadb62c7b370b095d1364050a52b71. Subsequent calls will reuse this data.




### GEM - Wikilingua

In [75]:
gem_wikilingua_en_ds = load_dataset('GEM/wiki_lingua', 'en', split='train')

Downloading builder script: 100%|██████████| 9.20k/9.20k [00:00<00:00, 5.76MB/s]
Downloading metadata: 100%|██████████| 770k/770k [00:00<00:00, 1.91MB/s]
Downloading readme: 100%|██████████| 17.8k/17.8k [00:00<00:00, 15.8MB/s]


Downloading and preparing dataset wiki_lingua/en (download: 2.17 GiB, generated: 357.04 MiB, post-processed: Unknown size, total: 2.52 GiB) to /nfs/home/marquez/.cache/huggingface/datasets/GEM___wiki_lingua/en/2.0.0/84e1fa083237de0bf0016a1934d8b659ecafd567f398012ca5d702b7acc97450...


Downloading data: 100%|██████████| 2.34G/2.34G [01:03<00:00, 36.7MB/s] 
                                                                                                  

Dataset wiki_lingua downloaded and prepared to /nfs/home/marquez/.cache/huggingface/datasets/GEM___wiki_lingua/en/2.0.0/84e1fa083237de0bf0016a1934d8b659ecafd567f398012ca5d702b7acc97450. Subsequent calls will reuse this data.




## Classification

# Pipelines

## Summarization

### CNN Daily Mail

#### BART

In [49]:
model_name = "facebook/bart-base"
custom_tokenizer = CustomBartTokenizer.from_pretrained(model_name)
custom_tokenizer.num_pads = 10
summarizer = pipeline(
    "summarization", 
    model=model_name, 
    tokenizer=custom_tokenizer, 
    framework="pt"
)

summarizer(cnn_dailymail_ds[:3]['article'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'CustomBartTokenizer'.


[{'summary_text': 'Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. "I\'ll definitely have some sort of party," Radcliffe said in an interview with Reuters on Monday. Daniel Daniel Rad'},
 {'summary_text': 'An inmate housed on the "forgotten floor," where many mentally ill inmates are housed in Miami before trial. MIAMI, Florida (CNN) -- In our Behind the Scenes series, CNN correspondents share their experiences in covering news and analyze the stories behind the events. Here, Soledad O\'Brien takes viewers inside a jail where many of the inmates are mentally ill. An inmate housed in the "F

#### PEGASUS

In [48]:
from transformers import PegasusForConditionalGeneration, PegasusConfig

model_name = "google/pegasus-xsum"
custom_tokenizer = CustomPegasusTokenizer.from_pretrained(model_name)
custom_tokenizer.num_pads = 1
config = PegasusConfig()
config.vocab_size = custom_tokenizer.vocab_size
model = PegasusForConditionalGeneration.from_pretrained(model_name, config=config)
# print(model.vocab_size)
summarizer = pipeline(
    "summarization", 
    model=model, 
    tokenizer=custom_tokenizer, 
    framework="pt"
)

summarizer(cnn_dailymail_ds[:2]['article'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PegasusTokenizer'. 
The class this function is called from is 'CustomPegasusTokenizer'.
Some weights of the model checkpoint at google/pegasus-xsum were not used when initializing PegasusForConditionalGeneration: ['model.encoder.layers.12.fc1.bias', 'model.encoder.layers.12.self_attn.q_proj.bias', 'model.decoder.layers.14.self_attn_layer_norm.bias', 'model.decoder.layers.13.encoder_attn_layer_norm.weight', 'model.decoder.layers.13.encoder_attn.out_proj.bias', 'model.encoder.layers.13.self_attn.v_proj.bias', 'model.encoder.layers.12.self_attn_layer_norm.weight', 'model.decoder.layers.13.self_attn.k_proj.weight', 'model.encoder.layers.15.fc2.weight', 'model.decoder.layers.15.self_attn.k_proj.bias', 'model.encoder.layers.14.self_attn.q_proj.weight', 'model.encoder.layers.15.self_at

[{'summary_text': 'The The The The The The The The The The The The The ....'},
 {'summary_text': 'The The The The The The The The The The The ,,....'}]