In [4]:
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import EncoderDecoderModel
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader

import nltk
from nltk.translate import bleu_score
from nltk.translate.bleu_score import corpus_bleu

import pandas as pd
import numpy as np
import time
from tqdm import tqdm

In [5]:
# setup device based on availability
if torch.cuda.is_available():
    print("Total GPUs: ", torch.cuda.device_count())
device = "cuda:4" if torch.cuda.is_available() else "cpu"
print("Using device: {}".format(device))

Total GPUs:  6
Using device: cuda:4


In [6]:
# load the dataset
train_source_text_path = "../data/sup_train.en-fr.fr"
train_target_text_path = "../data/sup_train.en-fr.en"
dev_source_text_path = "../data/sup_valid.en-fr.fr"
dev_target_text_path = "../data/sup_valid.en-fr.en"
test_source_text_path = "../data/test.en-fr.fr"
test_target_text_path = "../data/test.en-fr.en"

In [7]:
# read the data from the files and store them in pandas dataframe
def retrieve_data(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            line = line.strip()
            data.append(line)
    return data

In [8]:
enc_tokenizer = DistilBertTokenizer.from_pretrained('/soe/npullabh/244_final_project/gpt2/tryout/bert2bert_enc_tokenizer')
dec_tokenizer = GPT2Tokenizer.from_pretrained('/soe/npullabh/244_final_project/gpt2/tryout/bert2bert_dec_tokenizer')
dec_tokenizer.pad_token = dec_tokenizer.eos_token

encoder = DistilBertModel.from_pretrained('/soe/npullabh/244_final_project/gpt2/tryout/bert2bert_encoder.pt')
decoder = GPT2LMHeadModel.from_pretrained('/soe/npullabh/244_final_project/gpt2/tryout/bert2bert_decoder.pt')
model = EncoderDecoderModel(encoder=encoder, decoder=decoder).to(device)

In [9]:
# tokenize the data
def encode_data(source_sentences, target_sentences):
    tokenized_inputs = enc_tokenizer.batch_encode_plus(source_sentences, padding='longest', return_tensors='pt')
    tokenized_outputs = dec_tokenizer.batch_encode_plus(target_sentences, padding='longest', return_tensors='pt')
    
    input_ids = tokenized_inputs['input_ids']
    input_attention_mask = tokenized_inputs['attention_mask']
    labels = tokenized_outputs['input_ids']
    output_attention_mask = tokenized_outputs['attention_mask']
    
    return input_ids, input_attention_mask, labels, output_attention_mask

In [10]:
test_source_sentences = retrieve_data(test_source_text_path)
test_target_sentences = retrieve_data(test_target_text_path)

In [11]:
batch_size = 16
test_input_ids, test_attn_mask, test_labels, output_attention_masks = encode_data(test_source_sentences[:16], 
                                                          test_target_sentences[:16])
test_dataset = TensorDataset(test_input_ids, test_attn_mask, test_labels, output_attention_masks)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [12]:
# check if the batching works
for _, batch in zip(range(5), test_data_loader):
    print(batch[0].shape, batch[1].shape, batch[2].shape, batch[3].shape)

torch.Size([16, 139]) torch.Size([16, 139]) torch.Size([16, 59]) torch.Size([16, 59])


In [16]:
def evaluate(model, loader, tokenizer):
    total_loss = 0
    all_predictions = []
    all_targets = []
    all_prefixes = []
    
    model.eval()
    with torch.no_grad():
        for index, batch in tqdm(enumerate(loader), desc="Evaluating...", total=len(loader)):
            X = batch[0].to(device)
            attention_mask = batch[1].to(device)
            y = batch[2].to(device)
            decoder_attention_mask = batch[3].to(device)
            
            y_prefix = y[:, :15]
            decoder_attention_mask_prefix = decoder_attention_mask[:, :15]
            
            print(y.shape, decoder_attention_mask.shape)
            
            outputs = model.generate(inputs=X, max_length=18, decoder_input_ids=y_prefix, decoder_attention_mask=decoder_attention_mask_prefix)
            
            predicted_sentences = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            target_sentences = [tokenizer.decode(target, skip_special_tokens=True) for target in y]
            prefixes = [tokenizer.decode(pfx, skip_special_tokens=True) for pfx in y_prefix]
            all_predictions.extend(predicted_sentences)
            all_targets.extend(target_sentences)
            all_prefixes.extend(prefixes)
            
            loss = model(input_ids=X, decoder_input_ids=y, labels=y).loss
            total_loss += loss.item()
    
    avg_loss = total_loss / len(loader)
    
    return avg_loss, all_predictions, all_targets, all_prefixes

In [17]:
test_loss, test_predictions, test_targets, all_prefixes = evaluate(model, test_data_loader, dec_tokenizer)

Evaluating...:   0%|                                      | 0/1 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Evaluating...: 100%|██████████████████████████████| 1/1 [00:00<00:00,  5.04it/s]


torch.Size([16, 59]) torch.Size([16, 59])


In [18]:
for i in range(len(test_predictions)):
    print("Prefix : ", all_prefixes[i])
    print("Pred   : ", test_predictions[i])
    print("Label  : ", test_targets[i])
    print("-----------------")

Prefix :  the lega nord in italy, the vlaams blok
Pred   :  the lega nord in italy, the vlaams blokokokok
Label  :  the lega nord in italy, the vlaams blok in the netherlands, the supporters of le pen's national front in france, are all examples of parties or movements formed on the common theme of aversion to immigrants and promotion of simplistic policies to control them.
-----------------
Prefix :  this does not mean that the answer is to eliminate heterogeneity and create racially hom
Pred   :  this does not mean that the answer is to eliminate heterogeneity and create racially hom hom hom hom
Label  :  this does not mean that the answer is to eliminate heterogeneity and create racially homogenous communities, but an acknowledgment of the reality of these issues is needed in order to start constructing solid public policies toward race relations.
-----------------
Prefix :  some favor affirmative action programs that provide preferences for minorities in job allocation, college
Pre

In [24]:
src = ["la maison raciale divisée de l' europe"]
trg_prefix = ["europe 's "]
source_input_ids = enc_tokenizer(src)
target_input_ids = dec_tokenizer(trg_prefix)

print((source_input_ids), (target_input_ids))

generated = model.generate(inputs=source_input_ids.input_ids, max_length=15, decoder_input_ids=target_input_ids.input_ids)
print(generated)
print(tokenizer.decode(generated[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'input_ids': [[101, 2474, 26420, 5762, 2063, 4487, 11365, 4402, 2139, 1048, 1005, 2885, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} {'input_ids': [[44252, 431, 705, 82, 220]], 'attention_mask': [[1, 1, 1, 1, 1]]}


AttributeError: 'list' object has no attribute 'shape'

In [29]:
from transformers import EncoderDecoderModel

model = EncoderDecoderModel.from_pretrained('t5-small')

input_ids = model.encoder(['translate English to German: How are you doing today?'], return_tensors='pt').input_ids
decoder_input_ids = model.decoder(['<pad>'], return_tensors='pt').input_ids
decoder_attention_mask = torch.ones(decoder_input_ids.shape, dtype=torch.long, device=model.device) # attention mask for the decoder_input_ids

generated_ids = model.generate(
    input_ids=input_ids,
    decoder_input_ids=decoder_input_ids,
    attention_mask=decoder_attention_mask,
    temperature=1.0,
    top_k=50,
    top_p=0.95,
    max_length=100,
    do_sample=True,
)

generated_text = model.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)

You are using a model of type t5 to instantiate a model of type encoder-decoder. This is not supported for all configurations of models and can yield errors.


AssertionError: Config has to be initialized with encoder and decoder config

In [3]:
from transformers import AutoTokenizer, MT5Model
tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5Model.from_pretrained("google/mt5-small")
input_ids = tokenizer(
    "Studies have been shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
# preprocess: Prepend decoder_input_ids with start token which is pad token for MT5Model.
# This is not needed for torch's MT5ForConditionalGeneration as it does this internally using labels arg.
decoder_input_ids = model._shift_right(decoder_input_ids)
# forward pass
outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
#print(outputs)
print(tokenizer.decode(outputs.last_hidden_state))

Some weights of the model checkpoint at google/mt5-small were not used when initializing MT5Model: ['lm_head.weight']
- This IS expected if you are initializing MT5Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MT5Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: argument 'ids': 'list' object cannot be interpreted as an integer