In [None]:
%%capture
! pip install transformers
! pip install datasets
! pip install plotly

In [1]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, BartForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
name = "valhalla/bart-large-sst2"
model =  BartForSequenceClassification.from_pretrained(name).cuda()
tokenizer = AutoTokenizer.from_pretrained(name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
%matplotlib inline

from datasets import load_dataset
dataset_name = "sst2"

dataset = load_dataset(dataset_name)

Downloading builder script:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading and preparing dataset sst2/default to /root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset sst2 downloaded and prepared to /root/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def get_padded_inputs(tokenizer, sent, num_pad):

  tok = tokenizer(sent)
  input_ids = tok['input_ids']
  input_ids = torch.tensor([[tokenizer.cls_token_id] + [tokenizer.pad_token_id]*num_pad + input_ids[1:]]).cuda()

  attention_mask = torch.ones(1, len(input_ids[0])).cuda()
  attention_mask[:, 1:1+num_pad] = 0
      
  return {'input_ids':input_ids, 'attention_mask':attention_mask}

In [None]:
import torch
criterion = torch.nn.CrossEntropyLoss()

In [None]:
from torch.utils.data import DataLoader
valloader = DataLoader(dataset['validation'], batch_size = 64)

In [None]:
def get_padded_batch(tokenizer, batch_sentences, num_pad):

  sentences = [tokenizer.pad_token * num_pad + sentence for sentence in batch_sentences]
  inputs = tokenizer(sentences, return_tensors='pt', padding=True)
  inputs['attention_mask'][:, 1:num_pad + 1] = 0
  return {key:value.cuda() for key, value in inputs.items()}

In [None]:
from sklearn.metrics import precision_recall_fscore_support

split = 'validation'
results = []
ce_loss = []
num_sentences = []

for num_pads in range(0, 490):#model.config.max_position_embeddings):
  
  y_true = []
  y_pred = []
  tot_loss = 0
  N = len(dataset[split])
  cnt = 0
  for batch in valloader:
    
    labels = batch['label']
    cnt += len(labels)
    print(f'\r[{cnt}/{N}] ', end='')
    y_true += labels
    
    with torch.no_grad():
      inputs = get_padded_batch(tokenizer, batch['sentence'], num_pads)
      logits = model(**inputs).logits.cpu()
      loss = criterion(logits, labels)
      predicted_class_ids = torch.where(torch.softmax(logits, -1) > 0.5)[1].cpu().numpy().tolist()
      
      y_pred += predicted_class_ids
      tot_loss += loss.item()


  f1 = precision_recall_fscore_support(y_true, y_pred, average='macro')
  results.append(f1)
  ce_loss.append(tot_loss)
  print(f" [{num_pads}] F1 = {f1} cross_entropy_loss = {tot_loss:.3f}")


[872/872]  [0] F1 = (0.9529629921881164, 0.9531131598888608, 0.95297664220712, None) cross_entropy_loss = 2.777
[872/872]  [1] F1 = (0.9518022852092671, 0.9519449355897954, 0.9518285272345597, None) cross_entropy_loss = 2.941
[872/872]  [2] F1 = (0.956389537697536, 0.9565336364401784, 0.956416286545554, None) cross_entropy_loss = 2.895
[872/872]  [3] F1 = (0.956389537697536, 0.9565336364401784, 0.956416286545554, None) cross_entropy_loss = 2.912
[872/872]  [4] F1 = (0.9552336561679653, 0.955365412141113, 0.9552681111667936, None) cross_entropy_loss = 2.864
[872/872]  [5] F1 = (0.9552298877787774, 0.9553233139681738, 0.9552652867938676, None) cross_entropy_loss = 2.906
[872/872]  [6] F1 = (0.9552298877787774, 0.9553233139681738, 0.9552652867938676, None) cross_entropy_loss = 2.933
[872/872]  [7] F1 = (0.9563762626262626, 0.9564915382672392, 0.9564137640745027, None) cross_entropy_loss = 2.929
[872/872]  [8] F1 = (0.954088182679154, 0.9541550896691084, 0.9541166132588255, None) cross_ent

OutOfMemoryError: ignored

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(results))), y=[x[2] for x in results],
                    mode='lines+markers',
                    name='F1-score'))

fig.update_layout(title = 'F1 vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
fig.update_yaxes(range=(0.93, 1))

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(results))), y=[x[0] for x in results],
                    mode='lines+markers',
                    name='PRECISION'))

fig.update_layout(title = 'PREC vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
fig.update_yaxes(range=(0.93, 1))

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(results))), y=[x[1] for x in results],
                    mode='lines+markers',
                    name='RECALL'))

fig.update_layout(title = 'RECALL vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
fig.update_yaxes(range=(0.93, 1))

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(ce_loss))), y=ce_loss,
                    mode='lines+markers',
                    name='Cross Entropy'))

fig.update_layout(title = 'Cross Entroy vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
# fig.update_yaxes(range=(0.93, 1))

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification
name = "philschmid/tiny-bert-sst2-distilled"
model =  BertForSequenceClassification.from_pretrained(name).cuda()
tokenizer = AutoTokenizer.from_pretrained(name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import precision_recall_fscore_support

split = 'validation'
results = []
ce_loss = []
num_sentences = []

for num_pads in range(0, 490):#model.config.max_position_embeddings):
  
  y_true = []
  y_pred = []
  tot_loss = 0
  N = len(dataset[split])
  cnt = 0
  for batch in valloader:
    
    labels = batch['label']
    cnt += len(labels)
    print(f'\r[{cnt}/{N}] ', end='')
    y_true += labels
    
    with torch.no_grad():
      inputs = get_padded_batch(tokenizer, batch['sentence'], num_pads)
      logits = model(**inputs).logits.cpu()
      loss = criterion(logits, labels)
      predicted_class_ids = torch.where(torch.softmax(logits, -1) > 0.5)[1].cpu().numpy().tolist()
      
      y_pred += predicted_class_ids
      tot_loss += loss.item()


  f1 = precision_recall_fscore_support(y_true, y_pred, average='macro')
  results.append(f1)
  ce_loss.append(tot_loss)
  print(f" [{num_pads}] F1 = {f1} cross_entropy_loss = {tot_loss:.3f}")


[872/872]  [0] F1 = (0.8336963097398669, 0.8338069377789004, 0.8336978809764002, None) cross_entropy_loss = 11.578
[872/872]  [1] F1 = (0.8246197793641858, 0.8247137324240128, 0.8245355154604601, None) cross_entropy_loss = 11.825
[872/872]  [2] F1 = (0.8234433853081666, 0.8235455081249474, 0.8233861337177186, None) cross_entropy_loss = 12.265
[872/872]  [3] F1 = (0.8180944861165943, 0.8179990738401954, 0.8176583922406707, None) cross_entropy_loss = 12.458
[872/872]  [4] F1 = (0.810802493490097, 0.8109055316999243, 0.810767622166545, None) cross_entropy_loss = 12.785
[872/872]  [5] F1 = (0.8119752540874945, 0.8120737559989897, 0.8119177008422458, None) cross_entropy_loss = 12.934
[872/872]  [6] F1 = (0.8039217233488518, 0.8040224804243495, 0.803886444790783, None) cross_entropy_loss = 13.152
[872/872]  [7] F1 = (0.8016281333017702, 0.8017281299991581, 0.8015927189988623, None) cross_entropy_loss = 13.513
[872/872]  [8] F1 = (0.8038301051845069, 0.8038540877325924, 0.8038410385195729, No

Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors


[832/872] [872/872]  [457] F1 = (0.7891824751580849, 0.7156268417950661, 0.6996499153020892, None) cross_entropy_loss = 22.073
[64/872] [128/872] 

RuntimeError: ignored

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(results))), y=[x[2] for x in results],
                    mode='lines+markers',
                    name='F1-score'))

fig.update_layout(title = 'F1 vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
fig.update_yaxes(range=(0.55, 0.85))

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(results))), y=[x[0] for x in results],
                    mode='lines+markers',
                    name='PRECISION'))

fig.update_layout(title = 'PREC vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
fig.update_yaxes(range=(0.55, 0.85))

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(results))), y=[x[1] for x in results],
                    mode='lines+markers',
                    name='RECALL'))

fig.update_layout(title = 'RECALL vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
fig.update_yaxes(range=(0.55, 0.85))

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(ce_loss))), y=ce_loss,
                    mode='lines+markers',
                    name='Cross Entropy'))

fig.update_layout(title = 'Cross Entroy vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
# fig.update_yaxes(range=(0.93, 1))

In [None]:
from transformers import AutoTokenizer, RobertaForSequenceClassification
name = "WillHeld/roberta-base-sst2"
model =  RobertaForSequenceClassification.from_pretrained(name).cuda()
tokenizer = AutoTokenizer.from_pretrained(name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/994 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import precision_recall_fscore_support

split = 'validation'
results = []
ce_loss = []
num_sentences = []

for num_pads in range(0, 490):#model.config.max_position_embeddings):
  
  y_true = []
  y_pred = []
  tot_loss = 0
  N = len(dataset[split])
  cnt = 0
  for batch in valloader:
    
    labels = batch['label']
    cnt += len(labels)
    print(f'\r[{cnt}/{N}] ', end='')
    y_true += labels
    
    with torch.no_grad():
      inputs = get_padded_batch(tokenizer, batch['sentence'], num_pads)
      logits = model(**inputs).logits.cpu()
      loss = criterion(logits, labels)
      predicted_class_ids = torch.where(torch.softmax(logits, -1) > 0.5)[1].cpu().numpy().tolist()
      
      y_pred += predicted_class_ids
      tot_loss += loss.item()


  f1 = precision_recall_fscore_support(y_true, y_pred, average='macro')
  results.append(f1)
  ce_loss.append(tot_loss)
  print(f" [{num_pads}] F1 = {f1} cross_entropy_loss = {tot_loss:.3f}")


[872/872]  [0] F1 = (0.9322967904170196, 0.9324219078891975, 0.9323286809959185, None) cross_entropy_loss = 2.700
[872/872]  [1] F1 = (0.9322967904170196, 0.9324219078891975, 0.9323286809959185, None) cross_entropy_loss = 2.700
[872/872]  [2] F1 = (0.9322967904170196, 0.9324219078891975, 0.9323286809959185, None) cross_entropy_loss = 2.700
[872/872]  [3] F1 = (0.9322967904170196, 0.9324219078891975, 0.9323286809959185, None) cross_entropy_loss = 2.700
[872/872]  [4] F1 = (0.9322967904170196, 0.9324219078891975, 0.9323286809959185, None) cross_entropy_loss = 2.700
[872/872]  [5] F1 = (0.9322967904170196, 0.9324219078891975, 0.9323286809959185, None) cross_entropy_loss = 2.700
[192/872] 

KeyboardInterrupt: ignored

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(results))), y=[x[2] for x in results],
                    mode='lines+markers',
                    name='F1-score'))

fig.update_layout(title = 'F1 vs pad', width=800, height=480)
fig.update_xaxes(range=(0, 514))
fig.update_yaxes(range=(0.55, 0.85))

In [None]:
from transformers import AutoTokenizer, AutoModel
name = "philschmid/MiniLM-L6-H384-uncased-sst2"
model =  AutoModel.from_pretrained(name).cuda()
tokenizer = AutoTokenizer.from_pretrained(name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of the model checkpoint at philschmid/MiniLM-L6-H384-uncased-sst2 were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/512 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics import precision_recall_fscore_support

split = 'validation'
results = []
ce_loss = []
num_sentences = []

for num_pads in range(0, 490):#model.config.max_position_embeddings):
  
  y_true = []
  y_pred = []
  tot_loss = 0
  N = len(dataset[split])
  cnt = 0
  for batch in valloader:
    
    labels = batch['label']
    cnt += len(labels)
    print(f'\r[{cnt}/{N}] ', end='')
    y_true += labels
    
    with torch.no_grad():
      inputs = get_padded_batch(tokenizer, batch['sentence'], num_pads)
      logits = model(**inputs).logits.cpu()
      loss = criterion(logits, labels)
      predicted_class_ids = torch.where(torch.softmax(logits, -1) > 0.5)[1].cpu().numpy().tolist()
      
      y_pred += predicted_class_ids
      tot_loss += loss.item()


  f1 = precision_recall_fscore_support(y_true, y_pred, average='macro')
  results.append(f1)
  ce_loss.append(tot_loss)
  print(f" [{num_pads}] F1 = {f1} cross_entropy_loss = {tot_loss:.3f}")


[64/872] 

AttributeError: ignored

In [None]:
model(**inputs)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-9.5794e-01, -6.7354e-01, -1.6538e-01,  ..., -8.3906e-01,
          -1.0104e-01, -8.3302e-01],
         [-1.3720e+00, -4.8025e-01,  1.5249e-01,  ..., -4.2311e-01,
          -5.9596e-01, -9.0729e-01],
         [-1.0705e+00, -5.6866e-01,  2.2364e-01,  ..., -5.5123e-01,
          -1.2704e-01, -1.1038e+00],
         ...,
         [-1.2487e+00, -6.9671e-01,  3.5184e-02,  ..., -5.3999e-01,
          -2.9390e-01, -8.6470e-01],
         [-1.2465e+00, -6.9471e-01,  2.5736e-02,  ..., -5.4666e-01,
          -2.6978e-01, -8.7646e-01],
         [-1.2121e+00, -6.7640e-01,  2.4536e-02,  ..., -5.3762e-01,
          -2.5569e-01, -8.9734e-01]],

        [[ 7.4991e-01,  6.3987e-01, -1.4668e-01,  ...,  6.7181e-01,
           1.0347e-01,  8.8864e-01],
         [ 6.3579e-01,  1.3327e+00,  2.1240e-01,  ...,  1.3499e-01,
          -5.0428e-01,  1.3947e+00],
         [ 2.6697e-01,  5.0692e-01, -1.1797e-01,  ...,  5.0423e-01,
           2.

# GPT Testing

## Imports

In [4]:
from transformers import GPT2Tokenizer, GPT2Model
from datasets import load_dataset
import torch

## Load model

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.51MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.53MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 1.34MB/s]
Downloading pytorch_model.bin: 100%|██████████| 548M/548M [00:05<00:00, 95.5MB/s] 


## Load dataset

In [5]:
ds = load_dataset('cnn_dailymail', '3.0.0', split='train')

Found cached dataset cnn_dailymail (/nfs/home/marquez/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


## Useful functions

In [39]:
def get_padded_inputs(tokenizer, sent, num_pad, is_gpt=False):
  tok = tokenizer(sent)
  input_ids = tok['input_ids']

  if is_gpt:
    input_ids = torch.tensor([[tokenizer.pad_token_id]*num_pad + input_ids[1:]])
    attention_mask = torch.ones(1, len(input_ids[0]))
    attention_mask[:, :1+num_pad] = 0
  else:
    input_ids = torch.tensor([[tokenizer.cls_token_id] + [tokenizer.pad_token_id]*num_pad + input_ids[1:]])
    attention_mask = torch.ones(1, len(input_ids[0]))
    attention_mask[:, 1:1+num_pad] = 0
      
  return {'input_ids':input_ids, 'attention_mask':attention_mask}

#### Testing padding function

In [35]:
# tokenizer.pad_token = '[PAD]'
# tokenizer.cls_token = '[CLS]'

tokenizer(
    [
        "hello this is a sentece",
        "another sent",
        "asodi oasiduf osaidfu oaisd fuoaisfud oaisuf oaisd ufoiasudf ois"
    ],
    padding=True
)

{'input_ids': [[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 31373, 428, 318, 257, 1908, 68, 344], [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 29214, 1908], [292, 23130, 267, 292, 312, 3046, 28686, 1698, 20942, 267, 64, 9409, 14035, 12162, 4468, 463, 267, 15152, 3046, 267, 64, 9409, 334, 6513, 4448, 463, 69, 267, 271]], 'attention_mask': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [40]:
get_padded_inputs(tokenizer, 'Hello, this is a sentence', 10, is_gpt=True)

{'input_ids': tensor([[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
             11,   428,   318,   257,  6827]]),
 'attention_mask': tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.]])}

In [29]:
tokenizer('hello, this is a sentence')

{'input_ids': [31373, 11, 428, 318, 257, 6827], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [31]:
tokenizer('ouukh hello hello you')

{'input_ids': [280, 2724, 71, 23748, 23748, 345], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [34]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.padding_side = "left" 
tokenizer.pad_token = tokenizer.eos_token # to avoid an error
model = GPT2LMHeadModel.from_pretrained('gpt2')

device = 'cuda' if torch.cuda.is_available() else 'cpu'

texts = ["this is a first prompt u u u u u u", "this is a second prompt"]
encoding = tokenizer(texts, padding=True, return_tensors='pt').to(device)
print(encoding)
with torch.no_grad():
    generated_ids = model.generate(**encoding)
generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': tensor([[ 5661,   318,   257,   717,  6152,   334,   334,   334,   334,   334,
           334],
        [50256, 50256, 50256, 50256, 50256, 50256,  5661,   318,   257,  1218,
          6152]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]])}


## Inference

In [6]:
from transformers import pipeline
# use bart in pytorch
# summarizer = pipeline("summarization")
# summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)

# use t5 in tf
summarizer = pipeline("summarization", model="t5-large", tokenizer="t5-large", framework="pt")
summarizer(ds[0:2]['article'])

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': "Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has no plans to fritter his cash away . details of how he'll mark his landmark birthday are under wraps ."},
 {'summary_text': 'judge says one-third of all people in Miami-dade jails are mentally ill . mental patients often cycle through jails, hospital, only to return to jail . judge says new mental health facility will treat patients instead of jailing them .'}]

In [1]:
from transformers import T5Tokenizer

class CustomTokenizer():
    def __init__(
        self,
        name: str = 't5-small',
        num_pads: int = 10,
        is_gpt: bool = False,
        **kwargs
    ):
        self.tokenizer = T5Tokenizer.from_pretrained(name)
        self.name = name
        self.num_pad = num_pads
        self.tokenizer.pad_token = self.tokenizer.eos_token 

    def __call__(self, text, **kwargs):
        tok = self.tokenizer(text)
        input_ids = tok['input_ids']

        
        input_ids = torch.tensor([[self.tokenizer.pad_token_id]*self.num_pad + input_ids[1:]])
        attention_mask = torch.ones(1, len(input_ids[0]))
        attention_mask[:, :1+self.num_pad] = 0
            
        return {'input_ids':input_ids, 'attention_mask':attention_mask}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class CustomTokenizer(T5Tokenizer):
    _num_pads: int = 50

@property
def num_pads(self):
    return self._num_pads

@num_pads.setter
def num_pads(self, value):
    self._num_pads = value

def __init__(self, **kwargs):
    super().__init__(**kwargs)

def __call__(self, *args, **kwargs):
    print('hello_________________')
    tok = super().__call__(*args, **kwargs)
    input_ids = tok['input_ids']

    
    input_ids = torch.tensor([[super().pad_token_id]*self.num_pads + input_ids[1:]])
    attention_mask = torch.ones(1, len(input_ids[0]))
    attention_mask[:, :1+self.num_pads] = 0
        
    return {'input_ids':input_ids, 'attention_mask':attention_mask}

In [3]:
ct = CustomTokenizer.from_pretrained('t5-small')
ct('Harry Potter star Daniel Radcliffe turns 18 on monday . the young actor says he has')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'CustomTokenizer'.


{'input_ids': [8929, 16023, 2213, 4173, 6324, 12591, 15, 5050, 507, 30, 1911, 1135, 3, 5, 8, 1021, 7556, 845, 3, 88, 65, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [42]:
summarizer = pipeline("summarization", model="t5-large", tokenizer=ct, framework="pt")
summarizer(ds[0:2]['article'])

Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors


AttributeError: 'CustomTokenizer' object has no attribute 'decode'