# Imports

In [53]:
from transformers import pipeline
from datasets import load_dataset
from transformers import BartTokenizer
from transformers import BertTokenizer
from transformers import AutoTokenizer

import torch

# Custom Tokenizers

## BERT

In [54]:
t = AutoTokenizer.from_pretrained('bert-base-uncased', padding_side='left')
# t.padding_side
# t('a fhfhs fa sdfhas df asdhf',return_tensors='pt', padding=True)['input_ids']
t([
    'test sentence',
    'this is an even longer test sentence'
], return_tensors='pt', padding=True)

Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.53MB/s]


{'input_ids': tensor([[   0,    0,    0,    0,    0,  101, 3231, 6251,  102],
        [ 101, 2023, 2003, 2019, 2130, 2936, 3231, 6251,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

## BART

In [83]:
class CustomTokenizer(BartTokenizer):
    _num_pads: int = 50

    @property
    def num_pads(self):
        return self._num_pads

    @num_pads.setter
    def num_pads(self, value):
        self._num_pads = value

    def __init__(self, **kwargs):
        """
        """
        super().__init__(
            padding_side='left', 
            **kwargs
        )

    def __call__(self, *args, **kwargs):
        """
        """
        default_tokenization = super().__call__(
            *args, 
            return_tensors='pt',
            padding=True,
            **kwargs,
        )
        # input_ids => Tensor(batch size, input size)
        input_ids = default_tokenization['input_ids']
        attention_mask = default_tokenization['attention_mask']

        batch_size = input_ids.shape[0]

        new_pads = torch.tensor([[super().pad_token_id] * self.num_pads] * batch_size)
        new_pads_masks = torch.zeros((new_pads.shape[0], new_pads.shape[1]))

        input_ids = torch.column_stack((new_pads, input_ids))
        attention_mask = torch.column_stack((new_pads_masks, attention_mask))
            
        return {
            'input_ids': input_ids, 
            'attention_mask':attention_mask,
        }

In [84]:
ct = CustomTokenizer.from_pretrained('facebook/bart-base')
ct.num_pads = 2
# ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')
ct([
    'sdfas  dj j j j j js s',
    'sadjf k ds dj j  j io iu uio oiu iou ius sad'
])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'CustomTokenizer'.


{'input_ids': tensor([[    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     0,    29, 36807,   281,  1437, 29831,  1236,
           1236,  1236,  1236, 42898,   579,     2],
         [    1,     1,     0,    29, 41587,   506,   449,   385,    29, 29831,
           1236,  1437,  1236, 46155,   939,   257,  1717,  1020,  1021,  9060,
            939,  1438,   939,   687,  5074,     2]]),
 'attention_mask': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1.]])}

In [47]:
t = BartTokenizer.from_pretrained('facebook/bart-base', padding_side='left')
# t.padding_side
# t('a fhfhs fa sdfhas df asdhf',return_tensors='pt', padding=True)['input_ids']
t([
    'a fhfhs fa sdfhas df asdhf',
    'aksudjhf sldfkj hsdf hsd fhhf osi f'
], return_tensors='pt', padding=True)['input_ids']

tensor([[    1,     1,     1,     1,     1,     1,     1,     0,   102,   856,
           298,   506, 15354, 18363,   579, 36807,  7333, 47942,    25, 16593,
           506,     2],
        [    0,  6629,  1906,   267,   298,   506,   579,  4779,   506, 36085,
          1368,    29, 36807,  1368, 28045,   856, 36646,   506,  1021, 11000,
           856,     2]])

## T5

# Dataset

In [30]:
ds = load_dataset('cnn_dailymail', '3.0.0', split='train')

Found cached dataset cnn_dailymail (/nfs/home/marquez/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


# Pipelines

In [37]:
ds[0]['article']

'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Details o

In [40]:
summarizer = pipeline("summarization", model="facebook/bart-base", tokenizer=ct, framework="pt")
summarizer(ds[0]['article'])

tensor([[    0,   574,  4524,     6,  1156,    36,  1251,    43,   480,  3268,
         10997,   999,  3028,  7312, 20152,  3077,   899,     7,    10,   431,
           984,   844,   153,  1358,  4006,     4,   134,   153,    43, 13016,
            25,    37,  4072,   504,    15,   302,     6,    53,    37,  9838,
             5,   418,   351,    75,  2471,    10,  8921,    15,   123,     4,
          3028,  7312, 20152,    25,  3268, 10997,    11,    22, 29345, 10997,
             8,     5,  9729,     9,     5,  5524,   113,   598,     5, 10208,
             9, 20445,  6730,  1952,   198,     5,   232,     6,     5,   664,
          2701,   161,    37,    34,   117,   708,     7,   856,  3961,  1334,
            39,  1055,   409,    15,  1769,  1677,     6,  4076,     8,  6794,
          1799,     4,    22,   100,   218,    75,   563,     7,    28,    65,
             9,   167,    82,    54,     6,    25,  1010,    25,    51,  1004,
           504,     6,  6017,   907,  1235,    10,  

  input_ids = torch.concat((pads, torch.tensor(shifted_ids)))


RuntimeError: Tensors must have same number of dimensions: got 1 and 2