In [1]:
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained("unicamp-dl/ptt5-base-portuguese-vocab")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
from transformers import GPTJConfig, GPTJForCausalLM
import json

json_file = open('model/config.json')
config = json.load(json_file)

config = GPTJConfig(**config)
model = GPTJForCausalLM(config)

In [3]:
from datasets import load_dataset

dataset = load_dataset("assin2", cache_dir="dataset/assin2")

Downloading data:   0%|          | 0.00/376k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/157k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.1k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2448 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

In [4]:
train_data = dataset['train']['premise']

In [5]:
train_data_tokenized = tokenizer.batch_encode_plus(
    train_data,
    max_length=model.config.n_positions,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

In [6]:
import torch

# output
train_data_tokenized['label'] = train_data_tokenized['input_ids'][:, 1:13]
# train_data_tokenized['label'] = torch.cat((train_data_tokenized['label'], torch.ones(train_data_tokenized['label'].shape[0], 1, dtype=torch.int64)*tokenizer.eos_token_id), dim=1)

# input
train_data_tokenized['input_ids'] = train_data_tokenized['input_ids'][:, 0:12]
# train_data_tokenized['input_ids'] = torch.cat((train_data_tokenized['input_ids'], torch.ones(train_data_tokenized['input_ids'].shape[0], 1, dtype=torch.int64)*tokenizer.eos_token_id), dim=1)
train_data_tokenized['attention_mask'] = train_data_tokenized['attention_mask'][:, 0:12]
# train_data_tokenized['attention_mask'] = torch.cat((train_data_tokenized['attention_mask'], torch.ones(train_data_tokenized['attention_mask'].shape[0], 1, dtype=torch.int64)), dim=1)

### training loop

In [7]:
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

class GPT2Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = GPT2Dataset(train_data_tokenized)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=False)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [11]:
# import cross entropy
_batch = next(iter(train_loader))

for epoch in range(5):
    model.train()
    loop = tqdm(train_loader)
    for _ in loop:
        input_ids = _batch['input_ids'].to(device)
        attention_mask = _batch['attention_mask'].to(device)
        labels = _batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 3250/3250 [00:47<00:00, 69.14it/s, loss=2.36e-6]
Epoch 1: 100%|██████████| 3250/3250 [00:46<00:00, 69.74it/s, loss=3.31e-7]
Epoch 2:   8%|▊         | 257/3250 [00:03<00:41, 71.58it/s, loss=2.87e-7]


KeyboardInterrupt: 

In [12]:
# _text = 'Uma criança risonha está'
# _input = tokenizer.batch_encode_plus([_text], return_tensors="pt",)['input_ids']
# _input = torch.cat((_input, torch.ones(_input.shape[0], 1, dtype=torch.int64)*tokenizer.eos_token_id), dim=1).to(device)

_input = _batch['input_ids'].to(device)
output = model.generate(input_ids=_input, max_length=20 , num_beams=5, no_repeat_ngram_size=2)
print(tokenizer.decode(output[0], skip_special_tokens=True))
print('input = ', tokenizer.decode(_batch['input_ids'][0], skip_special_tokens=True))
print('output = ', tokenizer.decode(_batch['label'][0], skip_special_tokens=True))

Uma criança risonha está segurando uma pistola de água emaha segurando pistola água sendo portama
input =  Uma criança risonha está segurando uma pistola de água e
output =  criança risonha está segurando uma pistola de água e sendo
